Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/16 19:23:06 UTC

svn commit: r1725014 [20/28] - in /tika/branches/2.x: tika-parser-bundles/tika-multimedia-bundle/ tika-parser-modules/ tika-parser-modules/tika-advanced-module/ tika-parser-modules/tika-advanced-parser-module/ tika-parser-modules/tika-advanced-parser-m...

Added: tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,721 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.Writer;
+import java.text.SimpleDateFormat;
+import java.util.Calendar;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.ListIterator;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+
+import org.apache.commons.io.IOExceptionWithCause;
+import org.apache.commons.io.IOUtils;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
+import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
+import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.PDResources;
+import org.apache.pdfbox.pdmodel.common.COSObjectable;
+import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
+import org.apache.pdfbox.pdmodel.graphics.xobject.PDCcitt;
+import org.apache.pdfbox.pdmodel.graphics.xobject.PDJpeg;
+import org.apache.pdfbox.pdmodel.graphics.xobject.PDPixelMap;
+import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObject;
+import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectForm;
+import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
+import org.apache.pdfbox.pdmodel.interactive.action.type.PDAction;
+import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionURI;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
+import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode;
+import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
+import org.apache.pdfbox.pdmodel.interactive.form.PDField;
+import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField;
+import org.apache.pdfbox.util.PDFTextStripper;
+import org.apache.pdfbox.util.TextPosition;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Utility class that overrides the {@link PDFTextStripper} functionality
+ * to produce semi-structured XHTML SAX events instead of a plain text
+ * stream.
+ */
+class PDF2XHTML extends PDFTextStripper {
+
+    /**
+     * Maximum recursive depth during AcroForm processing.
+     * Prevents a theoretical AcroForm recursion bomb.
+     */
+    private final static int MAX_ACROFORM_RECURSIONS = 10;
+    /**
+     * Format used for signature dates
+     * TODO Make this thread-safe
+     */
+    private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT);
+    private final ContentHandler originalHandler;
+    private final ParseContext context;
+    private final XHTMLContentHandler handler;
+    private final PDFParserConfig config;
+    /**
+     * This keeps track of the PDF object ids for inline
+     * images that have already been processed.
+     * If {@link PDFParserConfig#getExtractUniqueInlineImagesOnly()}
+     * is true, this map is checked before extracting an embedded image.
+     * The integer value is the inlineImageCounter assigned to that image;
+     * it is used to identify the image in the markup.
+     *
+     * This map is shared across the whole document.  To avoid infinite
+     * recursion (TIKA-1742), the export is limited to one image per page.
+     */
+    private Map<String, Integer> processedInlineImages = new HashMap<>();
+    private int inlineImageCounter = 0;
+    private PDF2XHTML(ContentHandler handler, ParseContext context, Metadata metadata,
+                      PDFParserConfig config)
+            throws IOException {
+        //The source of the config (context or PDFParser defaults) has already
+        //been determined in PDFParser.  No need to check the context here.
+        this.config = config;
+        this.originalHandler = handler;
+        this.context = context;
+        this.handler = new XHTMLContentHandler(handler, metadata);
+    }
+
+    /**
+     * Converts the given PDF document (and related metadata) to a stream
+     * of XHTML SAX events sent to the given content handler.
+     *
+     * @param document PDF document
+     * @param handler  SAX content handler
+     * @param metadata PDF metadata
+     * @throws SAXException  if the content handler fails to process SAX events
+     * @throws TikaException if the PDF document cannot be processed
+     */
+    public static void process(
+            PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
+            PDFParserConfig config)
+            throws SAXException, TikaException {
+        try {
+            // Extract text using a dummy Writer as we override the
+            // key methods to output to the given content
+            // handler.
+            PDF2XHTML pdf2XHTML = new PDF2XHTML(handler, context, metadata, config);
+
+            config.configure(pdf2XHTML);
+
+            pdf2XHTML.writeText(document, new Writer() {
+                @Override
+                public void write(char[] cbuf, int off, int len) {
+                }
+
+                @Override
+                public void flush() {
+                }
+
+                @Override
+                public void close() {
+                }
+            });
+
+        } catch (IOException e) {
+            if (e.getCause() instanceof SAXException) {
+                throw (SAXException) e.getCause();
+            } else {
+                throw new TikaException("Unable to extract PDF content", e);
+            }
+        }
+    }
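+
+    // Illustrative use (a sketch only; PDFParser.parse() later in this commit is
+    // the actual call site): the caller passes an already-loaded PDDocument plus
+    // the SAX handler, ParseContext, Metadata and PDFParserConfig to be used.
+    //
+    //   PDF2XHTML.process(pdfDocument, contentHandler, parseContext, metadata, config);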
+
+    void extractBookmarkText() throws SAXException {
+        PDDocumentOutline outline = document.getDocumentCatalog().getDocumentOutline();
+        if (outline != null) {
+            extractBookmarkText(outline);
+        }
+    }
+
+    void extractBookmarkText(PDOutlineNode bookmark) throws SAXException {
+        PDOutlineItem current = bookmark.getFirstChild();
+        if (current != null) {
+            handler.startElement("ul");
+            while (current != null) {
+                handler.startElement("li");
+                handler.characters(current.getTitle());
+                handler.endElement("li");
+                // Recurse:
+                extractBookmarkText(current);
+                current = current.getNextSibling();
+            }
+            handler.endElement("ul");
+        }
+    }
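+
+    // A hedged sketch of the markup the method above emits: for a hypothetical
+    // two-level outline ("Chapter 1" containing "Section 1.1"), the SAX events
+    // correspond roughly to
+    //
+    //   <ul>
+    //     <li>Chapter 1</li>
+    //     <ul>
+    //       <li>Section 1.1</li>
+    //     </ul>
+    //   </ul>
+    //
+    // Note that the nested <ul> is emitted as a sibling of the parent <li>,
+    // not as its child.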
+
+    @Override
+    protected void startDocument(PDDocument pdf) throws IOException {
+        try {
+            handler.startDocument();
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("Unable to start a document", e);
+        }
+    }
+
+    @Override
+    protected void endDocument(PDDocument pdf) throws IOException {
+        try {
+            // Extract text for any bookmarks:
+            extractBookmarkText();
+            extractEmbeddedDocuments(pdf, originalHandler);
+
+            //extract acroform data at end of doc
+            if (config.getExtractAcroFormContent() == true) {
+                extractAcroForm(pdf, handler);
+            }
+            handler.endDocument();
+        } catch (TikaException e) {
+            throw new IOExceptionWithCause("Unable to end a document", e);
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("Unable to end a document", e);
+        }
+    }
+
+    @Override
+    protected void startPage(PDPage page) throws IOException {
+        try {
+            handler.startElement("div", "class", "page");
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("Unable to start a page", e);
+        }
+        writeParagraphStart();
+    }
+
+    @Override
+    protected void endPage(PDPage page) throws IOException {
+        try {
+            writeParagraphEnd();
+
+            extractImages(page.getResources(), new HashSet<COSBase>());
+
+            EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
+            for (PDAnnotation annotation : page.getAnnotations()) {
+
+                if (annotation instanceof PDAnnotationFileAttachment) {
+                    PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
+                    PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile();
+                    try {
+                        extractMultiOSPDEmbeddedFiles("", fileSpec, extractor);
+                    } catch (SAXException e) {
+                        throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
+                    } catch (TikaException e) {
+                        throw new IOExceptionWithCause("file embedded in annotation tika exception", e);
+                    }
+                }
+                // TODO: remove once PDFBOX-1143 is fixed:
+                if (config.getExtractAnnotationText()) {
+                    if (annotation instanceof PDAnnotationLink) {
+                        PDAnnotationLink annotationlink = (PDAnnotationLink) annotation;
+                        if (annotationlink.getAction() != null) {
+                            PDAction action = annotationlink.getAction();
+                            if (action instanceof PDActionURI) {
+                                PDActionURI uri = (PDActionURI) action;
+                                String link = uri.getURI();
+                                if (link != null) {
+                                    handler.startElement("div", "class", "annotation");
+                                    handler.startElement("a", "href", link);
+                                    handler.endElement("a");
+                                    handler.endElement("div");
+                                }
+                            }
+                        }
+                    }
+
+                    if (annotation instanceof PDAnnotationMarkup) {
+                        PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation;
+                        String title = annotationMarkup.getTitlePopup();
+                        String subject = annotationMarkup.getSubject();
+                        String contents = annotationMarkup.getContents();
+                        // TODO: maybe also annotationMarkup.getRichContents()?
+                        if (title != null || subject != null || contents != null) {
+                            handler.startElement("div", "class", "annotation");
+
+                            if (title != null) {
+                                handler.startElement("div", "class", "annotationTitle");
+                                handler.characters(title);
+                                handler.endElement("div");
+                            }
+
+                            if (subject != null) {
+                                handler.startElement("div", "class", "annotationSubject");
+                                handler.characters(subject);
+                                handler.endElement("div");
+                            }
+
+                            if (contents != null) {
+                                handler.startElement("div", "class", "annotationContents");
+                                handler.characters(contents);
+                                handler.endElement("div");
+                            }
+
+                            handler.endElement("div");
+                        }
+                    }
+                }
+            }
+
+            handler.endElement("div");
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("Unable to end a page", e);
+        }
+        page.clear();
+    }
+
+    private void extractImages(PDResources resources, Set<COSBase> seenThisPage) throws SAXException {
+        if (resources == null || config.getExtractInlineImages() == false) {
+            return;
+        }
+
+        Map<String, PDXObject> xObjects = resources.getXObjects();
+        if (xObjects == null) {
+            return;
+        }
+
+        for (Map.Entry<String, PDXObject> entry : xObjects.entrySet()) {
+
+            PDXObject object = entry.getValue();
+            if (object == null) {
+                continue;
+            }
+            COSBase cosObject = object.getCOSObject();
+            if (seenThisPage.contains(cosObject)) {
+                //avoid infinite recursion TIKA-1742
+                continue;
+            }
+            seenThisPage.add(cosObject);
+
+            if (object instanceof PDXObjectForm) {
+                extractImages(((PDXObjectForm) object).getResources(), seenThisPage);
+            } else if (object instanceof PDXObjectImage) {
+
+                PDXObjectImage image = (PDXObjectImage) object;
+
+                Metadata metadata = new Metadata();
+                String extension = "";
+                if (image instanceof PDJpeg) {
+                    metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+                    extension = ".jpg";
+                } else if (image instanceof PDCcitt) {
+                    metadata.set(Metadata.CONTENT_TYPE, "image/tiff");
+                    extension = ".tif";
+                } else if (image instanceof PDPixelMap) {
+                    metadata.set(Metadata.CONTENT_TYPE, "image/png");
+                    extension = ".png";
+                }
+
+                Integer imageNumber = processedInlineImages.get(entry.getKey());
+                if (imageNumber == null) {
+                    imageNumber = inlineImageCounter++;
+                }
+                String fileName = "image" + imageNumber + extension;
+                metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
+
+                // Output the img tag
+                AttributesImpl attr = new AttributesImpl();
+                attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName);
+                attr.addAttribute("", "alt", "alt", "CDATA", fileName);
+                handler.startElement("img", attr);
+                handler.endElement("img");
+
+                //Do we only want to process unique COSObject ids?
+                //If so, have we already processed this one?
+                if (config.getExtractUniqueInlineImagesOnly() == true) {
+                    String cosObjectId = entry.getKey();
+                    if (processedInlineImages.containsKey(cosObjectId)) {
+                        continue;
+                    }
+                    processedInlineImages.put(cosObjectId, imageNumber);
+                }
+
+                metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                        TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
+
+                EmbeddedDocumentExtractor extractor =
+                        getEmbeddedDocumentExtractor();
+                if (extractor.shouldParseEmbedded(metadata)) {
+                    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+                    try {
+                        image.write2OutputStream(buffer);
+                        image.clear();
+                        extractor.parseEmbedded(
+                                new ByteArrayInputStream(buffer.toByteArray()),
+                                new EmbeddedContentHandler(handler),
+                                metadata, false);
+                    } catch (IOException e) {
+                        // could not extract this image, so just skip it...
+                    }
+                }
+            }
+        }
+        resources.clear();
+    }
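+
+    // For each image extracted above, a placeholder is written into the XHTML
+    // along these lines (an illustrative sketch; the file name depends on the
+    // image counter and the image type):
+    //
+    //   <img src="embedded:image0.jpg" alt="image0.jpg"/>
+    //
+    // The image bytes themselves go to the EmbeddedDocumentExtractor.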
+
+    protected EmbeddedDocumentExtractor getEmbeddedDocumentExtractor() {
+        EmbeddedDocumentExtractor extractor =
+                context.get(EmbeddedDocumentExtractor.class);
+        if (extractor == null) {
+            extractor = new ParsingEmbeddedDocumentExtractor(context);
+        }
+        return extractor;
+    }
+
+    @Override
+    protected void writeParagraphStart() throws IOException {
+        super.writeParagraphStart();
+        try {
+            handler.startElement("p");
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("Unable to start a paragraph", e);
+        }
+    }
+
+    @Override
+    protected void writeParagraphEnd() throws IOException {
+        super.writeParagraphEnd();
+        try {
+            handler.endElement("p");
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("Unable to end a paragraph", e);
+        }
+    }
+
+    @Override
+    protected void writeString(String text) throws IOException {
+        try {
+            handler.characters(text);
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause(
+                    "Unable to write a string: " + text, e);
+        }
+    }
+
+    @Override
+    protected void writeCharacters(TextPosition text) throws IOException {
+        try {
+            handler.characters(text.getCharacter());
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause(
+                    "Unable to write a character: " + text.getCharacter(), e);
+        }
+    }
+
+    @Override
+    protected void writeWordSeparator() throws IOException {
+        try {
+            handler.characters(getWordSeparator());
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause(
+                    "Unable to write a space character", e);
+        }
+    }
+
+    @Override
+    protected void writeLineSeparator() throws IOException {
+        try {
+            handler.newline();
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause(
+                    "Unable to write a newline character", e);
+        }
+    }
+
+    private void extractEmbeddedDocuments(PDDocument document, ContentHandler handler)
+            throws IOException, SAXException, TikaException {
+        PDDocumentCatalog catalog = document.getDocumentCatalog();
+        PDDocumentNameDictionary names = catalog.getNames();
+        if (names == null) {
+            return;
+        }
+        PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
+
+        if (embeddedFiles == null) {
+            return;
+        }
+
+        Map<String, COSObjectable> embeddedFileNames = embeddedFiles.getNames();
+        //For now, try to get the embeddedFileNames out of embeddedFiles or its kids.
+        //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java
+        //If there is a need we could add a fully recursive search to find a non-null
+        //Map<String, COSObjectable> that contains the doc info.
+        if (embeddedFileNames != null) {
+            processEmbeddedDocNames(embeddedFileNames);
+        } else {
+            List<PDNameTreeNode> kids = embeddedFiles.getKids();
+            if (kids == null) {
+                return;
+            }
+            for (PDNameTreeNode n : kids) {
+                Map<String, COSObjectable> childNames = n.getNames();
+                if (childNames != null) {
+                    processEmbeddedDocNames(childNames);
+                }
+            }
+        }
+    }
+
+
+    private void processEmbeddedDocNames(Map<String, COSObjectable> embeddedFileNames)
+            throws IOException, SAXException, TikaException {
+        if (embeddedFileNames == null || embeddedFileNames.isEmpty()) {
+            return;
+        }
+
+        EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
+        for (Map.Entry<String, COSObjectable> ent : embeddedFileNames.entrySet()) {
+            PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue();
+            extractMultiOSPDEmbeddedFiles(ent.getKey(), spec, extractor);
+        }
+    }
+
+    private void extractMultiOSPDEmbeddedFiles(String defaultName,
+                                               PDComplexFileSpecification spec, EmbeddedDocumentExtractor extractor) throws IOException,
+            SAXException, TikaException {
+
+        if (spec == null) {
+            return;
+        }
+        //current strategy is to pull all, not just first non-null
+        extractPDEmbeddedFile(defaultName, spec.getFile(), spec.getEmbeddedFile(), extractor);
+        extractPDEmbeddedFile(defaultName, spec.getFileMac(), spec.getEmbeddedFileMac(), extractor);
+        extractPDEmbeddedFile(defaultName, spec.getFileDos(), spec.getEmbeddedFileDos(), extractor);
+        extractPDEmbeddedFile(defaultName, spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor);
+    }
+
+    private void extractPDEmbeddedFile(String defaultName, String fileName, PDEmbeddedFile file,
+                                       EmbeddedDocumentExtractor extractor)
+            throws SAXException, IOException, TikaException {
+
+        if (file == null) {
+            //skip silently
+            return;
+        }
+
+        fileName = (fileName == null) ? defaultName : fileName;
+
+        // TODO: other metadata?
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
+        metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
+        metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
+        metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
+
+        if (extractor.shouldParseEmbedded(metadata)) {
+            TikaInputStream stream = null;
+            try {
+                stream = TikaInputStream.get(file.createInputStream());
+                extractor.parseEmbedded(
+                        stream,
+                        new EmbeddedContentHandler(handler),
+                        metadata, false);
+
+                AttributesImpl attributes = new AttributesImpl();
+                attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+                attributes.addAttribute("", "id", "id", "CDATA", fileName);
+                handler.startElement("div", attributes);
+                handler.endElement("div");
+            } finally {
+                IOUtils.closeQuietly(stream);
+            }
+        }
+    }
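+
+    // After an embedded file has been parsed, a placeholder div is written so
+    // the attachment remains visible in the XHTML, roughly (illustrative name):
+    //
+    //   <div class="embedded" id="someAttachment.doc"/>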
+
+    private void extractAcroForm(PDDocument pdf, XHTMLContentHandler handler) throws IOException,
+            SAXException {
+        //Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields
+        //this code derives from Ben's code
+        PDDocumentCatalog catalog = pdf.getDocumentCatalog();
+
+        if (catalog == null)
+            return;
+
+        PDAcroForm form = catalog.getAcroForm();
+        if (form == null)
+            return;
+
+        @SuppressWarnings("rawtypes")
+        List fields = form.getFields();
+
+        if (fields == null)
+            return;
+
+        @SuppressWarnings("rawtypes")
+        ListIterator itr = fields.listIterator();
+
+        if (itr == null)
+            return;
+
+        handler.startElement("div", "class", "acroform");
+        handler.startElement("ol");
+
+        while (itr.hasNext()) {
+            Object obj = itr.next();
+            if (obj != null && obj instanceof PDField) {
+                processAcroField((PDField) obj, handler, 0);
+            }
+        }
+        handler.endElement("ol");
+        handler.endElement("div");
+    }
+
+    private void processAcroField(PDField field, XHTMLContentHandler handler, final int currentRecursiveDepth)
+            throws SAXException, IOException {
+
+        if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
+            return;
+        }
+
+        addFieldString(field, handler);
+
+        List<COSObjectable> kids = field.getKids();
+        if (kids != null) {
+
+            int r = currentRecursiveDepth + 1;
+            handler.startElement("ol");
+            //TODO: can generate <ol/>. Rework to avoid that.
+            for (COSObjectable pdfObj : kids) {
+                if (pdfObj != null && pdfObj instanceof PDField) {
+                    PDField kid = (PDField) pdfObj;
+                    //recurse
+                    processAcroField(kid, handler, r);
+                }
+            }
+            handler.endElement("ol");
+        }
+    }
+
+    private void addFieldString(PDField field, XHTMLContentHandler handler) throws SAXException {
+        //Pick partial name to present in content and altName for attribute
+        //Ignoring FullyQualifiedName for now
+        String partName = field.getPartialName();
+        String altName = field.getAlternateFieldName();
+
+        StringBuilder sb = new StringBuilder();
+        AttributesImpl attrs = new AttributesImpl();
+
+        if (partName != null) {
+            sb.append(partName).append(": ");
+        }
+        if (altName != null) {
+            attrs.addAttribute("", "altName", "altName", "CDATA", altName);
+        }
+        //return early if PDSignature field
+        if (field instanceof PDSignatureField) {
+            handleSignature(attrs, (PDSignatureField) field, handler);
+            return;
+        }
+        try {
+            //getValue can throw an IOException if there is no value
+            String value = field.getValue();
+            if (value != null && !value.equals("null")) {
+                sb.append(value);
+            }
+        } catch (IOException e) {
+            //swallow
+        }
+
+        if (attrs.getLength() > 0 || sb.length() > 0) {
+            handler.startElement("li", attrs);
+            handler.characters(sb.toString());
+            handler.endElement("li");
+        }
+    }
+
+    private void handleSignature(AttributesImpl parentAttributes, PDSignatureField sigField,
+                                 XHTMLContentHandler handler) throws SAXException {
+
+
+        PDSignature sig = sigField.getSignature();
+        if (sig == null) {
+            return;
+        }
+        Map<String, String> vals = new TreeMap<String, String>();
+        vals.put("name", sig.getName());
+        vals.put("contactInfo", sig.getContactInfo());
+        vals.put("location", sig.getLocation());
+        vals.put("reason", sig.getReason());
+
+        Calendar cal = sig.getSignDate();
+        if (cal != null) {
+            dateFormat.setTimeZone(cal.getTimeZone());
+            vals.put("date", dateFormat.format(cal.getTime()));
+        }
+        //see if there is any data
+        int nonNull = 0;
+        for (String val : vals.values()) {
+            if (val != null && !val.equals("")) {
+                nonNull++;
+            }
+        }
+        //if there is, process it
+        if (nonNull > 0) {
+            handler.startElement("li", parentAttributes);
+
+            AttributesImpl attrs = new AttributesImpl();
+            attrs.addAttribute("", "type", "type", "CDATA", "signaturedata");
+
+            handler.startElement("ol", attrs);
+            for (Map.Entry<String, String> e : vals.entrySet()) {
+                if (e.getValue() == null || e.getValue().equals("")) {
+                    continue;
+                }
+                attrs = new AttributesImpl();
+                attrs.addAttribute("", "signdata", "signdata", "CDATA", e.getKey());
+                handler.startElement("li", attrs);
+                handler.characters(e.getValue());
+                handler.endElement("li");
+            }
+            handler.endElement("ol");
+            handler.endElement("li");
+        }
+    }
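+
+    // When signature data is present, the markup emitted above looks roughly
+    // like this (values are illustrative only):
+    //
+    //   <li>
+    //     <ol type="signaturedata">
+    //       <li signdata="name">Jane Signer</li>
+    //       <li signdata="reason">Approval</li>
+    //     </ol>
+    //   </li>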
+}
+

Added: tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.pdf;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.pdfbox.cos.COSString;
+import org.apache.pdfbox.pdfparser.BaseParser;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+
+/**
+ * In fairly rare cases, a PDF's XMP will contain a string that
+ * has incorrectly been encoded with PDFEncoding: octal escapes for non-ASCII
+ * characters and plain ASCII for ASCII characters, e.g.
+ * "\376\377\000M\000i\000c\000r\000o\000s\000o\000f\000t\000"
+ * <p>
+ * This class can be used to decode those strings.
+ * <p>
+ * See TIKA-1678.  Many thanks to Andrew Jackson for raising this issue
+ * and Tilman Hausherr for the solution.
+ * <p>
+ * As of this writing, we are only handling strings that start with
+ * an encoded BOM.  Andrew Jackson found a handful of other examples (e.g.
+ * this ISO-8859-7 string:
+ * "Microsoft Word - \\323\\365\\354\\354\\345\\364\\357\\367\\336
+ * \\364\\347\\362 PRAKSIS \\363\\364\\357")
+ * that we aren't currently handling.
+ */
+class PDFEncodedStringDecoder {
+
+    private static final String[] PDF_ENCODING_BOMS = {
+            "\\376\\377", //UTF-16BE
+            "\\377\\376", //UTF-16LE
+            "\\357\\273\\277"//UTF-8
+    };
+
+    /**
+     * Does this string contain an octal-encoded UTF BOM?
+     * Call this statically to determine if you should bother creating a new parser to parse it.
+     * @param s string to check
+     * @return true if the string starts with an octal-encoded UTF BOM
+     */
+    static boolean shouldDecode(String s) {
+        if (s == null || s.length() < 8) {
+            return false;
+        }
+        for (String BOM : PDF_ENCODING_BOMS) {
+            if (s.startsWith(BOM)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * This assumes that {@link #shouldDecode(String)} has been called
+     * and has returned true.  If you run this on a non-octal encoded string,
+     * disaster will happen!
+     *
+     * @param value octal-encoded string to decode
+     * @return the decoded string, or the original value if decoding fails
+     */
+    String decode(String value) {
+        try {
+            byte[] bytes = new String("(" + value + ")").getBytes(ISO_8859_1);
+            InputStream is = new ByteArrayInputStream(bytes);
+            COSStringParser p = new COSStringParser(is);
+            String parsed = p.myParseCOSString();
+            if (parsed != null) {
+                return parsed;
+            }
+        } catch (IOException e) {
+            //oh well, we tried.
+        }
+        //just return value if something went wrong
+        return value;
+    }
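+
+    // Intended usage (a sketch; PDFParser.decode() in this commit does
+    // essentially this):
+    //
+    //   if (PDFEncodedStringDecoder.shouldDecode(raw)) {
+    //       raw = new PDFEncodedStringDecoder().decode(raw);
+    //   }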
+
+    class COSStringParser extends BaseParser {
+
+        COSStringParser(InputStream buffer) throws IOException {
+            super(buffer);
+        }
+
+        /**
+         *
+         * @return parsed string or null if something went wrong.
+         */
+        String myParseCOSString() {
+            try {
+                COSString cosString = parseCOSString();
+                if (cosString != null) {
+                    return cosString.getString();
+                }
+            } catch (IOException e) {
+            }
+            return null;
+        }
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,609 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Calendar;
+import java.util.Collections;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.jempbox.xmp.XMPSchema;
+import org.apache.jempbox.xmp.XMPSchemaDublinCore;
+import org.apache.jempbox.xmp.pdfa.XMPSchemaPDFAId;
+import org.apache.pdfbox.cos.COSArray;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSString;
+import org.apache.pdfbox.exceptions.CryptographyException;
+import org.apache.pdfbox.io.RandomAccess;
+import org.apache.pdfbox.io.RandomAccessBuffer;
+import org.apache.pdfbox.io.RandomAccessFile;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentInformation;
+import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
+import org.apache.pdfbox.pdmodel.font.PDFont;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.AccessPermissions;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PagedText;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.PasswordProvider;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * PDF parser.
+ * <p/>
+ * This parser can also process encrypted PDF documents if the required
+ * password is given as part of the input metadata associated with a
+ * document. If no password is given, then this parser will try decrypting
+ * the document using the empty password that's often used with PDFs. If
+ * the PDF contains any embedded documents (for example as part of a PDF
+ * package) then this parser will use the {@link EmbeddedDocumentExtractor}
+ * to handle them.
+ * <p/>
+ * As of Tika 1.6, it is possible to extract inline images with
+ * the {@link EmbeddedDocumentExtractor} as if they were regular
+ * attachments.  By default, this feature is turned off because of
+ * the potentially enormous number and size of inline images.  To
+ * turn this feature on, see
+ * {@link PDFParserConfig#setExtractInlineImages(boolean)}.
+ */
+public class PDFParser extends AbstractParser {
+
+
+    /**
+     * Metadata key for giving the document password to the parser.
+     *
+     * @since Apache Tika 0.5
+     * @deprecated Supply a {@link PasswordProvider} on the {@link ParseContext} instead
+     */
+    public static final String PASSWORD = "org.apache.tika.parser.pdf.password";
+    private static final MediaType MEDIA_TYPE = MediaType.application("pdf");
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -752276948656079347L;
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MEDIA_TYPE);
+    private PDFParserConfig defaultConfig = new PDFParserConfig();
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        PDDocument pdfDocument = null;
+        TemporaryResources tmp = new TemporaryResources();
+        //config from context, or default if not set via context
+        PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
+        String password = "";
+        try {
+            // PDFBox can process entirely in memory, or can use a temp file
+            //  for unpacked / processed resources
+            // Decide which to do based on whether we're already reading from a file
+            TikaInputStream tstream = TikaInputStream.cast(stream);
+            password = getPassword(metadata, context);
+            if (tstream != null && tstream.hasFile()) {
+                // File based, take that as a cue to use a temporary file
+                RandomAccess scratchFile = new RandomAccessFile(tmp.createTemporaryFile(), "rw");
+                if (localConfig.getUseNonSequentialParser() == true) {
+                    pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), scratchFile, password);
+                } else {
+                    pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), scratchFile, true);
+                }
+            } else {
+                // Go for the normal, stream based in-memory parsing
+                if (localConfig.getUseNonSequentialParser() == true) {
+                    pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), new RandomAccessBuffer(), password);
+                } else {
+                    pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), true);
+                }
+            }
+            metadata.set("pdf:encrypted", Boolean.toString(pdfDocument.isEncrypted()));
+
+            //if using the classic parser and the doc is encrypted, we must manually decrypt
+            if (!localConfig.getUseNonSequentialParser() && pdfDocument.isEncrypted()) {
+                pdfDocument.decrypt(password);
+            }
+
+            metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
+            extractMetadata(pdfDocument, metadata);
+
+            AccessChecker checker = localConfig.getAccessChecker();
+            checker.check(metadata);
+            if (handler != null) {
+                PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
+            }
+
+        } catch (CryptographyException e) {
+            //seq parser throws CryptographyException for bad password
+            throw new EncryptedDocumentException(e);
+        } catch (IOException e) {
+            //nonseq parser throws IOException for bad password
+            //At the Tika level, we want the same exception to be thrown
+            if (e.getMessage() != null &&
+                    e.getMessage().contains("Error (CryptographyException)")) {
+                metadata.set("pdf:encrypted", Boolean.toString(true));
+                throw new EncryptedDocumentException(e);
+            }
+            //rethrow any other IOExceptions
+            throw e;
+        } finally {
+            if (pdfDocument != null) {
+                pdfDocument.close();
+            }
+            tmp.dispose();
+            //TODO: once we migrate to PDFBox 2.0, remove this (PDFBOX-2200)
+            PDFont.clearResources();
+        }
+    }
+
+    private String getPassword(Metadata metadata, ParseContext context) {
+        String password = null;
+
+        // Did they supply a new style Password Provider?
+        PasswordProvider passwordProvider = context.get(PasswordProvider.class);
+        if (passwordProvider != null) {
+            password = passwordProvider.getPassword(metadata);
+        }
+
+        // Fall back on the old style metadata if set
+        if (password == null && metadata.get(PASSWORD) != null) {
+            password = metadata.get(PASSWORD);
+        }
+
+        // If no password is given, use an empty string as the default
+        if (password == null) {
+            password = "";
+        }
+        return password;
+    }
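+
+    // The preferred way to supply a password is a PasswordProvider on the
+    // ParseContext; the deprecated PASSWORD metadata key is only a fallback.
+    // A minimal sketch (the password value is hypothetical):
+    //
+    //   ParseContext context = new ParseContext();
+    //   context.set(PasswordProvider.class, new PasswordProvider() {
+    //       public String getPassword(Metadata metadata) {
+    //           return "secret";
+    //       }
+    //   });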
+
+
+    private void extractMetadata(PDDocument document, Metadata metadata)
+            throws TikaException {
+
+        //first extract AccessPermissions
+        AccessPermission ap = document.getCurrentAccessPermission();
+        metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY,
+                Boolean.toString(ap.canExtractForAccessibility()));
+        metadata.set(AccessPermissions.EXTRACT_CONTENT,
+                Boolean.toString(ap.canExtractContent()));
+        metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT,
+                Boolean.toString(ap.canAssembleDocument()));
+        metadata.set(AccessPermissions.FILL_IN_FORM,
+                Boolean.toString(ap.canFillInForm()));
+        metadata.set(AccessPermissions.CAN_MODIFY,
+                Boolean.toString(ap.canModify()));
+        metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS,
+                Boolean.toString(ap.canModifyAnnotations()));
+        metadata.set(AccessPermissions.CAN_PRINT,
+                Boolean.toString(ap.canPrint()));
+        metadata.set(AccessPermissions.CAN_PRINT_DEGRADED,
+                Boolean.toString(ap.canPrintDegraded()));
+
+
+        //now go for the XMP stuff
+        org.apache.jempbox.xmp.XMPMetadata xmp = null;
+        XMPSchemaDublinCore dcSchema = null;
+        try {
+            if (document.getDocumentCatalog().getMetadata() != null) {
+                xmp = document.getDocumentCatalog().getMetadata().exportXMPMetadata();
+            }
+            if (xmp != null) {
+                dcSchema = xmp.getDublinCoreSchema();
+            }
+        } catch (IOException e) {
+            //swallow
+        }
+        PDDocumentInformation info = document.getDocumentInformation();
+        metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
+        extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
+        extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema);
+        extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema);
+        addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
+        addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
+        addMetadata(metadata, "producer", info.getProducer());
+        extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);
+
+        // TODO: Move to description in Tika 2.0
+        addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
+        addMetadata(metadata, "trapped", info.getTrapped());
+        try {
+            // TODO Remove these in Tika 2.0
+            addMetadata(metadata, "created", info.getCreationDate());
+            addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
+        } catch (IOException e) {
+            // Invalid date format, just ignore
+        }
+        try {
+            Calendar modified = info.getModificationDate();
+            addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
+            addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
+        } catch (IOException e) {
+            // Invalid date format, just ignore
+        }
+
+        // All remaining metadata is custom
+        // Copy this over as-is
+        List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate",
+                "Keywords", "Producer", "Subject", "Title", "Trapped");
+        for (COSName key : info.getDictionary().keySet()) {
+            String name = key.getName();
+            if (!handledMetadata.contains(name)) {
+                addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key));
+            }
+        }
+
+        //try to get the various versions
+        //Caveats:
+        //    there is currently a fair amount of redundancy
+        //    TikaCoreProperties.FORMAT can be multivalued
+        //    There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion        
+        metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion()));
+        metadata.add(TikaCoreProperties.FORMAT.getName(),
+                MEDIA_TYPE.toString() + "; version=" +
+                        Float.toString(document.getDocument().getVersion()));
+
+        try {
+            if (xmp != null) {
+                xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
+                XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class);
+                if (pdfaxmp != null) {
+                    if (pdfaxmp.getPart() != null) {
+                        metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart()));
+                    }
+                    if (pdfaxmp.getConformance() != null) {
+                        metadata.set("pdfaid:conformance", pdfaxmp.getConformance());
+                        String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
+                        metadata.set("pdfa:PDFVersion", version);
+                        metadata.add(TikaCoreProperties.FORMAT.getName(),
+                                MEDIA_TYPE.toString() + "; version=\"" + version + "\"");
+                    }
+                }
+                // TODO WARN if this XMP version is inconsistent with document header version?          
+            }
+        } catch (IOException e) {
+            metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e);
+        }
+        //TODO: Let's try to move this into PDFBox.
+        //Attempt to determine Adobe extension level, if present:
+        COSDictionary root = document.getDocumentCatalog().getCOSDictionary();
+        COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions"));
+        if (extensions != null) {
+            for (COSName extName : extensions.keySet()) {
+                // If it's an Adobe one, interpret it to determine the extension level:
+                if (extName.equals(COSName.getPDFName("ADBE"))) {
+                    COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName);
+                    if (adobeExt != null) {
+                        String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
+                        int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
+                        //-1 is a sentinel value indicating that getInt could not read the value
+                        if (el != -1) {
+                            metadata.set("pdf:PDFExtensionVersion", baseVersion + " Adobe Extension Level " + el);
+                            metadata.add(TikaCoreProperties.FORMAT.getName(),
+                                    MEDIA_TYPE.toString() + "; version=\"" + baseVersion + " Adobe Extension Level " + el + "\"");
+                        }
+                    }
+                } else {
+                    // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format.
+                    metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
+                }
+            }
+        }
+    }
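+
+    // Typical version-related metadata set by extractMetadata() above
+    // (values are illustrative):
+    //
+    //   pdf:PDFVersion            -> "1.4"
+    //   TikaCoreProperties.FORMAT -> "application/pdf; version=1.4"
+    //
+    // and, when the XMP carries a PDF/A identifier, pdfaid:part,
+    // pdfaid:conformance and pdfa:PDFVersion (e.g. "A-1b").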
+
+    /**
+     * Try to extract all multilingual items from the XMPSchema
+     * <p/>
+     * This relies on the property having a valid xmp getName()
+     * <p/>
+     * For now, this only extracts the first language if the property does not allow multiple values (see TIKA-1295)
+     *
+     * @param metadata
+     * @param property
+     * @param pdfBoxBaseline
+     * @param schema
+     */
+    private void extractMultilingualItems(Metadata metadata, Property property,
+                                          String pdfBoxBaseline, XMPSchema schema) {
+        //if schema is null, just go with pdfBoxBaseline
+        if (schema == null) {
+            if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
+                addMetadata(metadata, property, pdfBoxBaseline);
+            }
+            return;
+        }
+
+        for (String lang : schema.getLanguagePropertyLanguages(property.getName())) {
+            String value = schema.getLanguageProperty(property.getName(), lang);
+
+            if (value != null && value.length() > 0) {
+                //if you're going to add it below in the baseline addition, don't add it now
+                if (pdfBoxBaseline != null && value.equals(pdfBoxBaseline)) {
+                    continue;
+                }
+                addMetadata(metadata, property, value);
+                if (!property.isMultiValuePermitted()) {
+                    return;
+                }
+            }
+        }
+
+        if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
+            //if we've already added something above and multivalue is not permitted
+            //return.
+            if (!property.isMultiValuePermitted()) {
+                if (metadata.get(property) != null) {
+                    return;
+                }
+            }
+            addMetadata(metadata, property, pdfBoxBaseline);
+        }
+    }
+
+
+    /**
+     * This tries to read a list from a particular property in
+     * XMPSchemaDublinCore.
+     * If it can't find the information, it falls back to the
+     * pdfboxBaseline.  The pdfboxBaseline should be the value
+     * that pdfbox returns from its PDDocumentInformation object
+ * (e.g. getAuthor()).  This method is designed to include the pdfboxBaseline,
+     * and it should not duplicate the pdfboxBaseline.
+     * <p/>
+     * Until PDFBOX-1803/TIKA-1233 are fixed, do not call this
+     * on dates!
+     * <p/>
+     * This relies on the property having a DublinCore compliant getName()
+     *
+     * @param property
+     * @param pdfBoxBaseline
+     * @param dc
+     * @param metadata
+     */
+    private void extractDublinCoreListItems(Metadata metadata, Property property,
+                                            String pdfBoxBaseline, XMPSchemaDublinCore dc) {
+        //if no dc, add baseline and return
+        if (dc == null) {
+            if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
+                addMetadata(metadata, property, pdfBoxBaseline);
+            }
+            return;
+        }
+        List<String> items = getXMPBagOrSeqList(dc, property.getName());
+        if (items == null) {
+            if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
+                addMetadata(metadata, property, pdfBoxBaseline);
+            }
+            return;
+        }
+        for (String item : items) {
+            if (pdfBoxBaseline != null && !item.equals(pdfBoxBaseline)) {
+                addMetadata(metadata, property, item);
+            }
+        }
+        //finally, add the baseline
+        if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
+            addMetadata(metadata, property, pdfBoxBaseline);
+        }
+    }
+
+    /**
+     * As of this writing, XMPSchema can contain bags or sequence lists
+     * for some attributes, despite what the standards documentation says.
+     * JempBox expects one or the other for specific attributes.
+     * Until more flexibility is added to JempBox, Tika will have to handle both.
+     *
+     * @param schema
+     * @param name
+     * @return list of values or null
+     */
+    private List<String> getXMPBagOrSeqList(XMPSchema schema, String name) {
+        List<String> ret = schema.getBagList(name);
+        if (ret == null) {
+            ret = schema.getSequenceList(name);
+        }
+        return ret;
+    }
+
+    private void addMetadata(Metadata metadata, Property property, String value) {
+        if (value != null) {
+            String decoded = decode(value);
+            if (property.isMultiValuePermitted() || metadata.get(property) == null) {
+                metadata.add(property, decoded);
+            }
+            //silently skip adding property that already exists if multiple values are not permitted
+        }
+    }
+
+    private void addMetadata(Metadata metadata, String name, String value) {
+        if (value != null) {
+            metadata.add(name, decode(value));
+        }
+    }
+
+    private String decode(String value) {
+        if (PDFEncodedStringDecoder.shouldDecode(value)) {
+            PDFEncodedStringDecoder d = new PDFEncodedStringDecoder();
+            return d.decode(value);
+        }
+        return value;
+    }
+
+    private void addMetadata(Metadata metadata, String name, Calendar value) {
+        if (value != null) {
+            metadata.set(name, value.getTime().toString());
+        }
+    }
+
+    private void addMetadata(Metadata metadata, Property property, Calendar value) {
+        if (value != null) {
+            metadata.set(property, value.getTime());
+        }
+    }
+
+    /**
+     * Used when processing custom metadata entries, as PDFBox won't do
+     * the conversion for us in the way it does for the standard ones
+     */
+    private void addMetadata(Metadata metadata, String name, COSBase value) {
+        if (value instanceof COSArray) {
+            for (Object v : ((COSArray) value).toList()) {
+                addMetadata(metadata, name, ((COSBase) v));
+            }
+        } else if (value instanceof COSString) {
+            addMetadata(metadata, name, ((COSString) value).getString());
+        }
+        // Avoid calling COSDictionary#toString, since it can lead to infinite
+        // recursion. See TIKA-1038 and PDFBOX-1835.
+        else if (value != null && !(value instanceof COSDictionary)) {
+            addMetadata(metadata, name, value.toString());
+        }
+    }
+
+    public PDFParserConfig getPDFParserConfig() {
+        return defaultConfig;
+    }
+
+    public void setPDFParserConfig(PDFParserConfig config) {
+        this.defaultConfig = config;
+    }
+
+    /**
+     * @see #setUseNonSequentialParser(boolean)
+     * @deprecated use {@link #getPDFParserConfig()}
+     */
+    public boolean getUseNonSequentialParser() {
+        return defaultConfig.getUseNonSequentialParser();
+    }
+
+    /**
+     * If true, the parser will use the NonSequentialParser.  This may
+     * be faster than the full doc parser.
+     * If false (default), this will use the full doc parser.
+     *
+     * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
+     */
+    public void setUseNonSequentialParser(boolean v) {
+        defaultConfig.setUseNonSequentialParser(v);
+    }
+
+    /**
+     * @see #setEnableAutoSpace(boolean)
+     * @deprecated use {@link #getPDFParserConfig()}
+     */
+    public boolean getEnableAutoSpace() {
+        return defaultConfig.getEnableAutoSpace();
+    }
+
+    /**
+     * If true (the default), the parser should estimate
+     * where spaces should be inserted between words.  For
+     * many PDFs this is necessary as they do not include
+     * explicit whitespace characters.
+     *
+     * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
+     */
+    public void setEnableAutoSpace(boolean v) {
+        defaultConfig.setEnableAutoSpace(v);
+    }
+
+    /**
+     * If true, text in annotations will be extracted.
+     *
+     * @deprecated use {@link #getPDFParserConfig()}
+     */
+    public boolean getExtractAnnotationText() {
+        return defaultConfig.getExtractAnnotationText();
+    }
+
+    /**
+     * If true (the default), text in annotations will be
+     * extracted.
+     *
+     * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
+     */
+    public void setExtractAnnotationText(boolean v) {
+        defaultConfig.setExtractAnnotationText(v);
+    }
+
+    /**
+     * @see #setSuppressDuplicateOverlappingText(boolean)
+     * @deprecated use {@link #getPDFParserConfig()}
+     */
+    public boolean getSuppressDuplicateOverlappingText() {
+        return defaultConfig.getSuppressDuplicateOverlappingText();
+    }
+
+    /**
+     * If true, the parser should try to remove duplicated
+     * text over the same region.  This is needed for some
+     * PDFs that achieve bolding by re-writing the same
+     * text in the same area.  Note that this can
+     * slow down extraction substantially (PDFBOX-956) and
+     * sometimes remove characters that were not in fact
+     * duplicated (PDFBOX-1155).  By default this is disabled.
+     *
+     * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
+     */
+    public void setSuppressDuplicateOverlappingText(boolean v) {
+        defaultConfig.setSuppressDuplicateOverlappingText(v);
+    }
+
+    /**
+     * @see #setSortByPosition(boolean)
+     * @deprecated use {@link #getPDFParserConfig()}
+     */
+    public boolean getSortByPosition() {
+        return defaultConfig.getSortByPosition();
+    }
+
+    /**
+     * If true, sort text tokens by their x/y position
+     * before extracting text.  This may be necessary for
+     * some PDFs (if the text tokens are not rendered "in
+     * order"), while for other PDFs it can produce the
+     * wrong result (for example if there are 2 columns,
+     * the text will be interleaved).  Default is false.
+     *
+     * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
+     */
+    public void setSortByPosition(boolean v) {
+        defaultConfig.setSortByPosition(v);
+    }
+
+}
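
The deprecated setters above simply delegate to the parser's default PDFParserConfig. A minimal sketch of the non-deprecated route they point to, configuring through getPDFParserConfig() and then parsing as usual; the class name and the file path are placeholders, and the handler and metadata classes are the standard tika-core ones:

    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Paths;

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.pdf.PDFParser;
    import org.apache.tika.sax.BodyContentHandler;

    public class ConfigureViaParserSketch {
        public static void main(String[] args) throws Exception {
            PDFParser parser = new PDFParser();
            // instead of the deprecated parser.setSortByPosition(true):
            parser.getPDFParserConfig().setSortByPosition(true);
            parser.getPDFParserConfig().setEnableAutoSpace(false);

            try (InputStream is = Files.newInputStream(Paths.get("test.pdf"))) {
                parser.parse(is, new BodyContentHandler(), new Metadata(), new ParseContext());
            }
        }
    }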

Added: tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,469 @@
+package org.apache.tika.parser.pdf;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Serializable;
+import java.util.Locale;
+import java.util.Properties;
+
+import org.apache.pdfbox.util.PDFTextStripper;
+
+/**
+ * Config for PDFParser.
+ * <p/>
+ * This allows parameters to be set programmatically:
+ * <ol>
+ * <li>Calls to PDFParser, e.g. parser.getPDFParserConfig().setEnableAutoSpace() (as before)</li>
+ * <li>Constructor of PDFParser</li>
+ * <li>Passing to PDFParser through a ParseContext: context.set(PDFParserConfig.class, config);</li>
+ * </ol>
+ * <p/>
+ * Parameters can also be set by modifying the PDFParser.properties file,
+ * which lives in the expected places.  In source, that is:
+ * tika-parser-modules/tika-pdf-parser-module/src/main/resources/org/apache/tika/parser/pdf
+ * <p/>
+ * Or, in tika-app-x.x.jar or tika-parsers-x.x.jar:
+ * org/apache/tika/parser/pdf
+ */
+public class PDFParserConfig implements Serializable {
+
+    private static final long serialVersionUID = 6492570218190936986L;
+
+    // True if we let PDFBox "guess" where spaces should go:
+    private boolean enableAutoSpace = true;
+
+    // True if we let PDFBox remove duplicate overlapping text:
+    private boolean suppressDuplicateOverlappingText;
+
+    // True if we extract annotation text ourselves
+    // (workaround for PDFBOX-1143):
+    private boolean extractAnnotationText = true;
+
+    // True if we should sort text tokens by position
+    // (necessary for some PDFs, but messes up other PDFs):
+    private boolean sortByPosition = false;
+
+    //True if we should use PDFBox's NonSequentialParser
+    private boolean useNonSequentialParser = false;
+
+    //True if acroform content should be extracted
+    private boolean extractAcroFormContent = true;
+
+    //True if inline PDXObjectImage objects should be extracted
+    private boolean extractInlineImages = false;
+
+    //True if inline images (as identified by their object id within
+    //a pdf file) should only be extracted once.
+    private boolean extractUniqueInlineImagesOnly = true;
+
+    //The character width-based tolerance value used to estimate where spaces in text should be added
+    private Float averageCharTolerance;
+
+    //The space width-based tolerance value used to estimate where spaces in text should be added
+    private Float spacingTolerance;
+
+    private AccessChecker accessChecker;
+
+    public PDFParserConfig() {
+        init(this.getClass().getResourceAsStream("PDFParser.properties"));
+    }
+
+    /**
+     * Loads properties from the InputStream and then tries to close it.
+     * If there is an IOException, this silently swallows the exception
+     * and falls back to the defaults.
+     *
+     * @param is
+     */
+    public PDFParserConfig(InputStream is) {
+        init(is);
+    }
+
+    //initializes object and then tries to close inputstream
+    private void init(InputStream is) {
+
+        if (is == null) {
+            return;
+        }
+        Properties props = new Properties();
+        try {
+            props.load(is);
+        } catch (IOException e) {
+            //swallow and keep the defaults
+        } finally {
+            if (is != null) {
+                try {
+                    is.close();
+                } catch (IOException e) {
+                    //swallow
+                }
+            }
+        }
+        setEnableAutoSpace(
+                getProp(props.getProperty("enableAutoSpace"), getEnableAutoSpace()));
+        setSuppressDuplicateOverlappingText(
+                getProp(props.getProperty("suppressDuplicateOverlappingText"),
+                        getSuppressDuplicateOverlappingText()));
+        setExtractAnnotationText(
+                getProp(props.getProperty("extractAnnotationText"),
+                        getExtractAnnotationText()));
+        setSortByPosition(
+                getProp(props.getProperty("sortByPosition"),
+                        getSortByPosition()));
+        setUseNonSequentialParser(
+                getProp(props.getProperty("useNonSequentialParser"),
+                        getUseNonSequentialParser()));
+        setExtractAcroFormContent(
+                getProp(props.getProperty("extractAcroFormContent"),
+                        getExtractAcroFormContent()));
+        setExtractInlineImages(
+                getProp(props.getProperty("extractInlineImages"),
+                        getExtractInlineImages()));
+        setExtractUniqueInlineImagesOnly(
+                getProp(props.getProperty("extractUniqueInlineImagesOnly"),
+                        getExtractUniqueInlineImagesOnly()));
+
+        boolean checkExtractAccessPermission = getProp(props.getProperty("checkExtractAccessPermission"), false);
+        boolean allowExtractionForAccessibility = getProp(props.getProperty("allowExtractionForAccessibility"), true);
+
+        if (!checkExtractAccessPermission) {
+            //silently ignore the contradictory configuration of
+            //checkExtractAccessPermission=false with allowExtractionForAccessibility=false
+            accessChecker = new AccessChecker();
+        } else {
+            accessChecker = new AccessChecker(allowExtractionForAccessibility);
+        }
+    }
+
+    /**
+     * Configures the given pdf2XHTML.
+     *
+     * @param pdf2XHTML
+     */
+    public void configure(PDF2XHTML pdf2XHTML) {
+        pdf2XHTML.setForceParsing(true);
+        pdf2XHTML.setSortByPosition(getSortByPosition());
+        if (getEnableAutoSpace()) {
+            pdf2XHTML.setWordSeparator(" ");
+        } else {
+            pdf2XHTML.setWordSeparator("");
+        }
+        if (getAverageCharTolerance() != null) {
+            pdf2XHTML.setAverageCharTolerance(getAverageCharTolerance());
+        }
+        if (getSpacingTolerance() != null) {
+            pdf2XHTML.setSpacingTolerance(getSpacingTolerance());
+        }
+        pdf2XHTML.setSuppressDuplicateOverlappingText(getSuppressDuplicateOverlappingText());
+    }
+
+    /**
+     * @see #setExtractAcroFormContent(boolean)
+     */
+    public boolean getExtractAcroFormContent() {
+        return extractAcroFormContent;
+    }
+
+    /**
+     * If true (the default), extract content from AcroForms
+     * at the end of the document.
+     *
+     * @param extractAcroFormContent
+     */
+    public void setExtractAcroFormContent(boolean extractAcroFormContent) {
+        this.extractAcroFormContent = extractAcroFormContent;
+    }
+
+    /**
+     * @see #setExtractInlineImages(boolean)
+     */
+    public boolean getExtractInlineImages() {
+        return extractInlineImages;
+    }
+
+    /**
+     * If true, extract inline embedded images (XObject images).
+     * <b>Beware:</b> some PDF documents of modest size (~4MB) can contain
+     * thousands of embedded images totaling > 2.5 GB.  Also, at least as of PDFBox 1.8.5,
+     * there can be surprisingly large memory consumption and/or out of memory errors.
+     * Set to <code>true</code> with caution.
+     * <p/>
+     * The default is <code>false</code>.
+     * <p/>
+     * See also {@link #setExtractUniqueInlineImagesOnly(boolean)}.
+     *
+     * @param extractInlineImages
+     */
+    public void setExtractInlineImages(boolean extractInlineImages) {
+        this.extractInlineImages = extractInlineImages;
+    }
+
+    /**
+     * @see #setExtractUniqueInlineImagesOnly(boolean)
+     */
+    public boolean getExtractUniqueInlineImagesOnly() {
+        return extractUniqueInlineImagesOnly;
+    }
+
+    /**
+     * Multiple pages within a PDF file might refer to the same underlying image.
+     * If {@link #extractUniqueInlineImagesOnly} is set to <code>false</code>, the
+     * parser will call the EmbeddedExtractor each time the image appears on a page.
+     * This might be desired for some use cases.  However, to avoid duplication of
+     * extracted images, set this to <code>true</code>.  The default is <code>true</code>.
+     * <p/>
+     * Note that uniqueness is determined only by the underlying PDF COSObject id, not by
+     * file hash or similar equality metric.
+     * If the PDF actually contains multiple copies of the same image
+     * -- all with different object ids -- then all images will be extracted.
+     * <p/>
+     * For this parameter to have any effect, {@link #extractInlineImages} must be
+     * set to <code>true</code>.
+     * <p>
+     * Because of TIKA-1742 -- to avoid infinite recursion -- no matter the setting
+     * of this parameter, the extractor will only pull out one copy of each image per
+     * page.  This parameter tries to capture uniqueness across the entire document.
+     *
+     * @param extractUniqueInlineImagesOnly
+     */
+    public void setExtractUniqueInlineImagesOnly(boolean extractUniqueInlineImagesOnly) {
+        this.extractUniqueInlineImagesOnly = extractUniqueInlineImagesOnly;
+    }
+
+    /**
+     * @see #setEnableAutoSpace(boolean)
+     */
+    public boolean getEnableAutoSpace() {
+        return enableAutoSpace;
+    }
+
+    /**
+     * If true (the default), the parser should estimate
+     * where spaces should be inserted between words.  For
+     * many PDFs this is necessary as they do not include
+     * explicit whitespace characters.
+     */
+    public void setEnableAutoSpace(boolean enableAutoSpace) {
+        this.enableAutoSpace = enableAutoSpace;
+    }
+
+    /**
+     * @see #setSuppressDuplicateOverlappingText(boolean)
+     */
+    public boolean getSuppressDuplicateOverlappingText() {
+        return suppressDuplicateOverlappingText;
+    }
+
+    /**
+     * If true, the parser should try to remove duplicated
+     * text over the same region.  This is needed for some
+     * PDFs that achieve bolding by re-writing the same
+     * text in the same area.  Note that this can
+     * slow down extraction substantially (PDFBOX-956) and
+     * sometimes remove characters that were not in fact
+     * duplicated (PDFBOX-1155).  By default this is disabled.
+     */
+    public void setSuppressDuplicateOverlappingText(
+            boolean suppressDuplicateOverlappingText) {
+        this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingText;
+    }
+
+    /**
+     * @see #setExtractAnnotationText(boolean)
+     */
+    public boolean getExtractAnnotationText() {
+        return extractAnnotationText;
+    }
+
+    /**
+     * If true (the default), text in annotations will be
+     * extracted.
+     */
+    public void setExtractAnnotationText(boolean extractAnnotationText) {
+        this.extractAnnotationText = extractAnnotationText;
+    }
+
+    /**
+     * @see #setSortByPosition(boolean)
+     */
+    public boolean getSortByPosition() {
+        return sortByPosition;
+    }
+
+    /**
+     * If true, sort text tokens by their x/y position
+     * before extracting text.  This may be necessary for
+     * some PDFs (if the text tokens are not rendered "in
+     * order"), while for other PDFs it can produce the
+     * wrong result (for example if there are 2 columns,
+     * the text will be interleaved).  Default is false.
+     */
+    public void setSortByPosition(boolean sortByPosition) {
+        this.sortByPosition = sortByPosition;
+    }
+
+    /**
+     * @see #setUseNonSequentialParser(boolean)
+     */
+    public boolean getUseNonSequentialParser() {
+        return useNonSequentialParser;
+    }
+
+    /**
+     * If true, uses PDFBox's non-sequential parser.
+     * The non-sequential parser should be much faster than the traditional
+     * full doc parser.  However, until PDFBOX-XXX is fixed,
+     * the non-sequential parser fails
+     * to extract some document metadata.
+     * <p/>
+     * Default is false (use the traditional parser)
+     *
+     * @param useNonSequentialParser
+     */
+    public void setUseNonSequentialParser(boolean useNonSequentialParser) {
+        this.useNonSequentialParser = useNonSequentialParser;
+    }
+
+    /**
+     * @see #setAverageCharTolerance(Float)
+     */
+    public Float getAverageCharTolerance() {
+        return averageCharTolerance;
+    }
+
+    /**
+     * See {@link PDFTextStripper#setAverageCharTolerance(float)}
+     */
+    public void setAverageCharTolerance(Float averageCharTolerance) {
+        this.averageCharTolerance = averageCharTolerance;
+    }
+
+    /**
+     * @see #setSpacingTolerance(Float)
+     */
+    public Float getSpacingTolerance() {
+        return spacingTolerance;
+    }
+
+    /**
+     * See {@link PDFTextStripper#setSpacingTolerance(float)}
+     */
+    public void setSpacingTolerance(Float spacingTolerance) {
+        this.spacingTolerance = spacingTolerance;
+    }
+
+    public AccessChecker getAccessChecker() {
+        return accessChecker;
+    }
+
+    public void setAccessChecker(AccessChecker accessChecker) {
+        this.accessChecker = accessChecker;
+    }
+
+    private boolean getProp(String p, boolean defaultMissing) {
+        if (p == null) {
+            return defaultMissing;
+        }
+        if (p.toLowerCase(Locale.ROOT).equals("true")) {
+            return true;
+        } else if (p.toLowerCase(Locale.ROOT).equals("false")) {
+            return false;
+        } else {
+            return defaultMissing;
+        }
+    }
+
+    @Override
+    public int hashCode() {
+        final int prime = 31;
+        int result = 1;
+        result = prime
+                * result
+                + ((averageCharTolerance == null) ? 0 : averageCharTolerance
+                .hashCode());
+        result = prime * result + (enableAutoSpace ? 1231 : 1237);
+        result = prime * result + (extractAcroFormContent ? 1231 : 1237);
+        result = prime * result + (extractAnnotationText ? 1231 : 1237);
+        result = prime * result + (extractInlineImages ? 1231 : 1237);
+        result = prime * result + (extractUniqueInlineImagesOnly ? 1231 : 1237);
+        result = prime * result + (sortByPosition ? 1231 : 1237);
+        result = prime
+                * result
+                + ((spacingTolerance == null) ? 0 : spacingTolerance.hashCode());
+        result = prime * result
+                + (suppressDuplicateOverlappingText ? 1231 : 1237);
+        result = prime * result + (useNonSequentialParser ? 1231 : 1237);
+        return result;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj)
+            return true;
+        if (obj == null)
+            return false;
+        if (getClass() != obj.getClass())
+            return false;
+        PDFParserConfig other = (PDFParserConfig) obj;
+        if (averageCharTolerance == null) {
+            if (other.averageCharTolerance != null)
+                return false;
+        } else if (!averageCharTolerance.equals(other.averageCharTolerance))
+            return false;
+        if (enableAutoSpace != other.enableAutoSpace)
+            return false;
+        if (extractAcroFormContent != other.extractAcroFormContent)
+            return false;
+        if (extractAnnotationText != other.extractAnnotationText)
+            return false;
+        if (extractInlineImages != other.extractInlineImages)
+            return false;
+        if (extractUniqueInlineImagesOnly != other.extractUniqueInlineImagesOnly)
+            return false;
+        if (sortByPosition != other.sortByPosition)
+            return false;
+        if (spacingTolerance == null) {
+            if (other.spacingTolerance != null)
+                return false;
+        } else if (!spacingTolerance.equals(other.spacingTolerance))
+            return false;
+        if (suppressDuplicateOverlappingText != other.suppressDuplicateOverlappingText)
+            return false;
+        if (useNonSequentialParser != other.useNonSequentialParser)
+            return false;
+        return true;
+    }
+
+    @Override
+    public String toString() {
+        return "PDFParserConfig [enableAutoSpace=" + enableAutoSpace
+                + ", suppressDuplicateOverlappingText="
+                + suppressDuplicateOverlappingText + ", extractAnnotationText="
+                + extractAnnotationText + ", sortByPosition=" + sortByPosition
+                + ", useNonSequentialParser=" + useNonSequentialParser
+                + ", extractAcroFormContent=" + extractAcroFormContent
+                + ", extractInlineImages=" + extractInlineImages
+                + ", extractUniqueInlineImagesOnly="
+                + extractUniqueInlineImagesOnly + ", averageCharTolerance="
+                + averageCharTolerance + ", spacingTolerance="
+                + spacingTolerance + "]";
+    }
+}
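
As the class Javadoc above notes, a PDFParserConfig can also travel through the ParseContext instead of being set on the parser instance. A short sketch under that assumption; AutoDetectParser is the usual Tika entry point, and the class name and PDF path here are placeholders:

    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Paths;

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.AutoDetectParser;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.pdf.PDFParserConfig;
    import org.apache.tika.sax.BodyContentHandler;

    public class ConfigureViaContextSketch {
        public static void main(String[] args) throws Exception {
            PDFParserConfig config = new PDFParserConfig();
            config.setExtractInlineImages(true);           // off by default; see the Javadoc warning
            config.setExtractUniqueInlineImagesOnly(true); // de-duplicate by COSObject id

            ParseContext context = new ParseContext();
            context.set(PDFParserConfig.class, config);

            try (InputStream is = Files.newInputStream(Paths.get("example.pdf"))) {
                new AutoDetectParser().parse(is, new BodyContentHandler(), new Metadata(), context);
            }
        }
    }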

Added: tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added)
+++ tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Sat Jan 16 18:23:01 2016
@@ -0,0 +1,17 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+
+org.apache.tika.parser.pdf.PDFParser
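
This service entry is what lets the PDF parser be discovered without explicit wiring. Tika's own service-loading machinery does the real discovery, but the file format is the plain JDK one, so a minimal illustration with java.util.ServiceLoader (the class name is a placeholder) behaves the same way:

    import java.util.ServiceLoader;

    import org.apache.tika.parser.Parser;

    public class ListParsersSketch {
        public static void main(String[] args) {
            // scans META-INF/services/org.apache.tika.parser.Parser on the classpath
            // and instantiates each listed implementation, including PDFParser
            for (Parser parser : ServiceLoader.load(Parser.class)) {
                System.out.println(parser.getClass().getName());
            }
        }
    }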

Added: tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties (added)
+++ tika/branches/2.x/tika-parser-modules/tika-pdf-parser-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties Sat Jan 16 18:23:01 2016
@@ -0,0 +1,25 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+enableAutoSpace true
+extractAnnotationText true
+sortByPosition false
+suppressDuplicateOverlappingText false
+useNonSequentialParser false
+extractAcroFormContent true
+extractInlineImages false
+extractUniqueInlineImagesOnly true
+checkExtractAccessPermission false
+allowExtractionForAccessibility true
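
These keys line up with the names that PDFParserConfig#init looks up, so the same format can be used to override the defaults at runtime through the InputStream constructor added above. A hedged sketch follows; the byte-array source and class name are only for illustration, and any stream with the same key/value layout would do:

    import java.io.ByteArrayInputStream;
    import java.nio.charset.StandardCharsets;

    import org.apache.tika.parser.pdf.PDFParserConfig;

    public class LoadOverridesSketch {
        public static void main(String[] args) {
            String overrides = "sortByPosition true\nenableAutoSpace false\n";
            // the constructor loads the properties and closes the stream itself;
            // unlisted keys keep their built-in defaults
            PDFParserConfig config = new PDFParserConfig(
                    new ByteArrayInputStream(overrides.getBytes(StandardCharsets.UTF_8)));
            System.out.println(config); // toString() added in this commit prints the settings
        }
    }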