You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/02/13 17:17:26 UTC

[tika] branch master updated: TIKA-3026 -- initial capability to extract text and markup (if it exists) in PDFs

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new 45e60a2  TIKA-3026 -- initial capability to extract text and markup (if it exists) in PDFs
45e60a2 is described below

commit 45e60a2a8285f49bd9d9f3bdde172b23d0f067a0
Author: tallison <ta...@apache.org>
AuthorDate: Thu Feb 13 12:14:31 2020 -0500

    TIKA-3026 -- initial capability to extract text and markup (if it exists) in PDFs
---
 .../main/java/org/apache/tika/metadata/PDF.java    |   2 +
 .../java/org/apache/tika/parser/pdf/PDF2XHTML.java |   4 +-
 .../tika/parser/pdf/PDFMarkedContent2XHTML.java    | 589 +++++++++++++++++++++
 .../java/org/apache/tika/parser/pdf/PDFParser.java |  35 +-
 .../apache/tika/parser/pdf/PDFParserConfig.java    |  25 +-
 .../parser/pdf/PDFMarkedContent2XHTMLTest.java     |  81 +++
 6 files changed, 731 insertions(+), 5 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index d9a6213..f129f84 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -90,4 +90,6 @@ public interface PDF {
      * Has > 0 AcroForm fields
      */
     Property HAS_ACROFORM_FIELDS = Property.internalBoolean(PDF_PREFIX+"hasAcroFormFields");
+
+    Property HAS_MARKED_CONTENT = Property.internalBoolean(PDF_PREFIX+"hasMarkedContent");
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index 4ed0d90..8c2f3f2 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -61,7 +61,7 @@ class PDF2XHTML extends AbstractPDF2XHTML {
      */
     private Map<COSStream, Integer> processedInlineImages = new HashMap<>();
     private AtomicInteger inlineImageCounter = new AtomicInteger(0);
-    private PDF2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
+    PDF2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
                       PDFParserConfig config)
             throws IOException {
         super(document, handler, context, metadata, config);
@@ -146,7 +146,7 @@ class PDF2XHTML extends AbstractPDF2XHTML {
         }
     }
 
-    private void extractImages(PDPage page) throws SAXException, IOException {
+    void extractImages(PDPage page) throws SAXException, IOException {
         if (config.getExtractInlineImages() == false) {
             return;
         }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
new file mode 100644
index 0000000..9f764f9
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
@@ -0,0 +1,589 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.pdfbox.cos.COSArray;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.cos.COSInteger;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSObject;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.PDPageTree;
+import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
+import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
+import org.apache.pdfbox.text.PDFMarkedContentExtractor;
+import org.apache.pdfbox.text.TextPosition;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOExceptionWithCause;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Set;
+
+/**
+ * <p>This was added in Tika 1.24 as an alpha version of a text extractor
+ * that builds the text from the marked text tree and includes/normalizes
+ * some of the structural tags.
+ * </p>
+ *
+ * @since 1.24
+ */
+
+public class PDFMarkedContent2XHTML extends PDF2XHTML {
+
+    private static final int MAX_RECURSION_DEPTH = 1000;
+    private static final String DIV = "div";
+    private static final Map<String, HtmlTag> COMMON_TAG_MAP = new HashMap<>();
+
+    static {
+        //code requires these to be all lower case
+        COMMON_TAG_MAP.put("document", new HtmlTag("body"));
+        COMMON_TAG_MAP.put("div", new HtmlTag("div"));
+        COMMON_TAG_MAP.put("p", new HtmlTag("p"));
+        COMMON_TAG_MAP.put("span", new HtmlTag("span"));
+        COMMON_TAG_MAP.put("table", new HtmlTag("table"));
+        COMMON_TAG_MAP.put("thead", new HtmlTag("thead"));
+        COMMON_TAG_MAP.put("tbody", new HtmlTag("tbody"));
+        COMMON_TAG_MAP.put("tr", new HtmlTag("tr"));
+        COMMON_TAG_MAP.put("th", new HtmlTag("th"));
+        COMMON_TAG_MAP.put("td", new HtmlTag("td"));//TODO -- convert to th if in thead?
+        COMMON_TAG_MAP.put("l", new HtmlTag("ul"));
+        COMMON_TAG_MAP.put("li", new HtmlTag("li"));
+        COMMON_TAG_MAP.put("h1", new HtmlTag("h1"));
+        COMMON_TAG_MAP.put("h2", new HtmlTag("h2"));
+        COMMON_TAG_MAP.put("h3", new HtmlTag("h3"));
+        COMMON_TAG_MAP.put("h4", new HtmlTag("h4"));
+        COMMON_TAG_MAP.put("h5", new HtmlTag("h5"));
+        COMMON_TAG_MAP.put("h6", new HtmlTag("h6"));
+    }
+
+    //this stores state as we recurse through the structure tag tree
+    private State state = new State();
+
+    private PDFMarkedContent2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
+                                   PDFParserConfig config)
+            throws IOException {
+        super(document, handler, context, metadata, config);
+    }
+
+    /**
+     * Converts the given PDF document (and related metadata) to a stream
+     * of XHTML SAX events sent to the given content handler.
+     *
+     * @param pdDocument PDF document
+     * @param handler    SAX content handler
+     * @param metadata   PDF metadata
+     * @throws SAXException  if the content handler fails to process SAX events
+     * @throws TikaException if there was an exception outside of per page processing
+     */
+    public static void process(
+            PDDocument pdDocument, ContentHandler handler, ParseContext context, Metadata metadata,
+            PDFParserConfig config)
+            throws SAXException, TikaException {
+
+        PDFMarkedContent2XHTML pdfMarkedContent2XHTML = null;
+        try {
+            pdfMarkedContent2XHTML = new PDFMarkedContent2XHTML(pdDocument, handler, context, metadata, config);
+        } catch (IOException e) {
+            throw new TikaException("couldn't initialize PDFMarkedContent2XHTML", e);
+        }
+        try {
+            pdfMarkedContent2XHTML.writeText(pdDocument, new Writer() {
+                @Override
+                public void write(char[] cbuf, int off, int len) {
+                }
+
+                @Override
+                public void flush() {
+                }
+
+                @Override
+                public void close() {
+                }
+            });
+        } catch (IOException e) {
+            if (e.getCause() instanceof SAXException) {
+                throw (SAXException) e.getCause();
+            } else {
+                throw new TikaException("Unable to extract PDF content", e);
+            }
+        }
+        if (pdfMarkedContent2XHTML.exceptions.size() > 0) {
+            //throw the first
+            throw new TikaException("Unable to extract PDF content", pdfMarkedContent2XHTML.exceptions.get(0));
+        }
+    }
+
+    @Override
+    protected void processPages(PDPageTree pages) throws IOException {
+
+        //this is a 0-indexed list of object refs for each page
+        //we need this to map the mcids later...
+        //TODO: is there a better way of getting these/doing the mapping?
+
+        List<ObjectRef> pageRefs = new ArrayList<>();
+        //STEP 1: get the page refs
+        findPages(pdDocument.getPages().getCOSObject().getItem(COSName.KIDS), pageRefs);
+        //confirm the right number of pages was found
+        if (pageRefs.size() != pdDocument.getNumberOfPages()) {
+            throw new IOExceptionWithCause(
+                    new TikaException("Couldn't find the right number of page refs ("
+                            + pageRefs.size() + ") for pages (" +
+                            pdDocument.getNumberOfPages() + ")"));
+        }
+
+        PDStructureTreeRoot structureTreeRoot = pdDocument.getDocumentCatalog().getStructureTreeRoot();
+
+        //STEP 2: load the roleMap
+        Map<String, HtmlTag> roleMap = loadRoleMap(structureTreeRoot.getRoleMap());
+
+        //STEP 3: load all of the text, mapped to MCIDs
+        Map<MCID, String> paragraphs = loadTextByMCID(pageRefs);
+
+        //STEP 4: now recurse through the structure tree root and output the structure
+        //and the text bits from paragraphs
+
+        try {
+            recurse(structureTreeRoot.getK(), null, 0, paragraphs, roleMap);
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause(e);
+        }
+
+        //STEP 5: handle all the potentially unprocessed bits
+        try {
+            if (state.hrefAnchorBuilder.length() > 0) {
+                xhtml.startElement("p");
+                writeString(state.hrefAnchorBuilder.toString());
+                xhtml.endElement("p");
+            }
+            for (MCID mcid : paragraphs.keySet()) {
+                if (!state.processedMCIDs.contains(mcid)) {
+                    if (mcid.mcid > -1) {
+                        //TODO: LOG! piece of text that wasn't referenced in the marked content tree
+                        // but should have been.  If mcid == -1, this was a known item not part of
+                        // content tree.
+                    }
+
+                    xhtml.startElement("p");
+                    writeString(paragraphs.get(mcid));
+                    xhtml.endElement("p");
+                }
+            }
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause(e);
+        }
+        //Step 6: for now, iterate through the pages again and do all the other handling
+        //TODO: figure out when we're crossing page boundaries during the recursion
+        // step above and do the page by page processing then...rather than dumping this
+        // all here.
+        for (PDPage page : pdDocument.getPages()) {
+            startPage(page);
+            endPage(page);
+        }
+
+    }
+
+    private void recurse(COSBase kids, ObjectRef currentPageRef, int depth,
+                         Map<MCID, String> paragraphs, Map<String, HtmlTag> roleMap) throws IOException, SAXException {
+
+        if (depth > MAX_RECURSION_DEPTH) {
+            throw new IOExceptionWithCause(
+                    new TikaException("Exceeded max recursion depth "+MAX_RECURSION_DEPTH));
+        }
+
+        if (kids instanceof COSArray) {
+            for (COSBase k : ((COSArray) kids)) {
+                recurse(k, currentPageRef, depth, paragraphs, roleMap);
+            }
+        } else if (kids instanceof COSObject) {
+            COSBase cosType = ((COSObject)kids).getItem(COSName.TYPE);
+            if (cosType != null && cosType instanceof COSName) {
+                if ("OBJR".equals(((COSName)cosType).getName())) {
+                    recurse(((COSObject)kids).getDictionaryObject(COSName.OBJ),currentPageRef,
+                            depth+1, paragraphs, roleMap);
+                }
+            }
+
+            COSBase n = ((COSObject) kids).getItem(COSName.S);
+            String name = "";
+            if (n instanceof COSName) {
+                name = ((COSName) n).getName();
+            }
+            COSBase grandkids = ((COSObject) kids).getItem(COSName.K);
+            if (grandkids == null) {
+                return;
+            }
+            COSBase pageBase = ((COSObject) kids).getItem(COSName.PG);
+
+            if (pageBase != null && pageBase instanceof COSObject) {
+                currentPageRef = new ObjectRef(((COSObject) pageBase).getObjectNumber(),
+                        ((COSObject) pageBase).getGenerationNumber());
+            }
+
+            HtmlTag tag = getTag(name, roleMap);
+            boolean startedLink = false;
+            boolean ignoreTag = false;
+            if ("link".equals(tag.clazz)) {
+                state.inLink = true;
+                startedLink = true;
+            }
+            if (!state.inLink) {
+                //TODO: currently suppressing span and lbody...
+                // is this what we want to do?  What else should we suppress?
+                if ("span".equals(tag.tag)) {
+                    ignoreTag = true;
+                } else if ("lbody".equals(tag.clazz)) {
+                    ignoreTag = true;
+                }
+                if (!ignoreTag) {
+                    if (!StringUtils.isAllBlank(tag.clazz)) {
+                        xhtml.startElement(tag.tag, "class", tag.clazz);
+                    } else {
+                        xhtml.startElement(tag.tag);
+                    }
+                }
+            }
+
+            recurse(grandkids, currentPageRef, depth + 1, paragraphs, roleMap);
+            if (startedLink) {
+                writeLink();
+            }
+            if (!state.inLink && !startedLink && !ignoreTag) {
+                xhtml.endElement(tag.tag);
+            }
+        } else if (kids instanceof COSInteger) {
+            int mcidInt = ((COSInteger) kids).intValue();
+            MCID mcid = new MCID(currentPageRef, mcidInt);
+            if (paragraphs.containsKey(mcid)) {
+                if (state.inLink) {
+                    state.hrefAnchorBuilder.append(paragraphs.get(mcid));
+                } else {
+                    try {
+                        //if it isn't a uri, output this anyhow
+                        writeString(paragraphs.get(mcid));
+                    } catch (IOException e) {
+                        handleCatchableIOE(e);
+                    }
+                }
+                state.processedMCIDs.add(mcid);
+            } else {
+                //TODO: log can't find mcid
+            }
+        } else if (kids instanceof COSDictionary) {
+            //TODO: check for other types of dictionary?
+            COSDictionary dict = (COSDictionary) kids;
+            COSDictionary anchor = dict.getCOSDictionary(COSName.A);
+            //check for subtype /Link ?
+            //COSName subtype = obj.getCOSName(COSName.SUBTYPE);
+            if (anchor != null) {
+                state.uri = anchor.getString(COSName.URI);
+            } else {
+                if (dict.containsKey(COSName.K)) {
+                    recurse(dict.getDictionaryObject(COSName.K), currentPageRef, depth + 1, paragraphs, roleMap);
+                } else if (dict.containsKey(COSName.OBJ)) {
+                    recurse(dict.getDictionaryObject(COSName.OBJ), currentPageRef, depth + 1, paragraphs, roleMap);
+
+                }
+            }
+        } else {
+            //TODO: handle a different object?
+        }
+    }
+
+    private void writeLink() throws SAXException, IOException {
+        //This is only for URIs, obviously.
+        //If we want to catch within-document references (GOTO), we need to cache those in state.
+        //See testPDF_childAttachments.pdf for examples
+        if (! StringUtils.isAllBlank(state.uri)) {
+            xhtml.startElement("a", "href", state.uri);
+            xhtml.characters(state.hrefAnchorBuilder.toString());
+            xhtml.endElement("a");
+        } else {
+            try {
+                //if it isn't a uri, output this anyhow
+                writeString(state.hrefAnchorBuilder.toString());
+            } catch (IOException e) {
+                handleCatchableIOE(e);
+            }
+        }
+        state.hrefAnchorBuilder.setLength(0);
+        state.inLink = false;
+        state.uri = null;
+
+    }
+
+
+    private HtmlTag getTag(String name, Map<String, HtmlTag> roleMap) {
+        if (roleMap.containsKey(name)) {
+            return roleMap.get(name);
+        }
+        String lc = name.toLowerCase(Locale.US);
+        if (COMMON_TAG_MAP.containsKey(lc)) {
+            return COMMON_TAG_MAP.get(lc);
+        }
+        roleMap.put(name, new HtmlTag(DIV, name.toLowerCase(Locale.US)));
+        return roleMap.get(name);
+    }
+
+
+    private static Map<String, HtmlTag> loadRoleMap(Map<String, Object> roleMap) {
+        if (roleMap == null) {
+            return Collections.EMPTY_MAP;
+        }
+        Map<String, HtmlTag> tags = new HashMap<>();
+        for (Map.Entry<String, Object> e : roleMap.entrySet()) {
+            String k = e.getKey();
+            Object obj = e.getValue();
+            if (obj instanceof String) {
+                String v = (String) obj;
+                String lc = v.toLowerCase(Locale.US);
+                if (COMMON_TAG_MAP.containsValue(new HtmlTag(lc))) {
+                    tags.put(k, new HtmlTag(lc));
+                } else {
+                    tags.put(k, new HtmlTag(DIV, lc));
+                }
+            }
+        }
+        return tags;
+    }
+
+    private Map<MCID, String> loadTextByMCID(List<ObjectRef> pageRefs) throws IOException {
+        int pageCount = 1;
+        Map<MCID, String> paragraphs = new HashMap<>();
+        for (PDPage page : pdDocument.getPages()) {
+            ObjectRef pageRef = pageRefs.get(pageCount - 1);
+            PDFMarkedContentExtractor ex = new PDFMarkedContentExtractor();
+            try {
+                ex.processPage(page);
+            } catch (IOException e) {
+                handleCatchableIOE(e);
+                continue;
+            }
+            for (PDMarkedContent c : ex.getMarkedContents()) {
+                //TODO: at some point also handle
+                // 1. c.getActualText()
+                // 2. c.getExpandedForm()
+                // 3. c.getAlternateDescription()
+                // 4. c.getLanguage()
+
+                List<Object> objects = c.getContents();
+                StringBuilder sb = new StringBuilder();
+                //TODO: sort text positions? Figure out when to add/remove a newline and/or space?
+                for (Object o : objects) {
+                    if (o instanceof TextPosition) {
+                        String unicode = ((TextPosition) o).getUnicode();
+                        if (unicode != null) {
+                            sb.append(unicode);
+                        }
+                    }/*
+                    TODO: do we want to do anything with these?
+                    TODO: Are there other types of objects we need to handle here?
+                    else if (o instanceof PDImageXObject) {
+
+                    } else if (o instanceof PDTransparencyGroup) {
+
+                    } else if (o instanceof PDMarkedContent) {
+
+                    } else if (o instanceof PDFormXObject) {
+
+                    } else {
+                        throw new RuntimeException("can't handle "+o.getClass());
+                    }*/
+                }
+
+                int mcidInt = c.getMCID();
+                MCID mcid = new MCID(pageRef, mcidInt);
+                String p = sb.toString();
+                if (c.getTag().equals("P")) {
+                    p = p.trim();
+                }
+
+                if (mcidInt < 0) {
+                    //mcidInt == -1 for text bits that do not have an actual
+                    //mcid -- concatenate these bits
+                    if (paragraphs.containsKey(mcid)) {
+                        p = paragraphs.get(mcid) + "\n" + p;
+                    }
+                }
+
+                paragraphs.put(mcid, p);
+
+            }
+            pageCount++;
+        }
+        return paragraphs;
+    }
+
+    private static void findPages(COSBase kidsObj, List<ObjectRef> pageRefs) {
+        if (kidsObj == null) {
+            return;
+        }
+        if (kidsObj instanceof COSArray) {
+            for (COSBase kid : ((COSArray) kidsObj)) {
+                if (kid instanceof COSObject) {
+                    COSBase kidbase = ((COSObject) kid).getObject();
+                    if (kidbase instanceof COSDictionary) {
+                        COSDictionary dict = (COSDictionary) kidbase;
+                        if (dict.containsKey(COSName.TYPE) && COSName.PAGE.equals(dict.getCOSName(COSName.TYPE))) {
+                            pageRefs.add(new ObjectRef(((COSObject) kid).getObjectNumber(),
+                                    ((COSObject) kid).getGenerationNumber()));
+                            continue;
+                        }
+                        if (((COSDictionary) kidbase).containsKey(COSName.KIDS)) {
+                            findPages(((COSDictionary) kidbase).getItem(COSName.KIDS), pageRefs);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+
+    private static class State {
+        Set<MCID> processedMCIDs = new HashSet<>();
+        boolean inLink = false;
+        private StringBuilder hrefAnchorBuilder = new StringBuilder();
+        private String uri = null;
+        private int tdDepth = 0;
+        int tableDepth = 0;
+    }
+
+    private static class HtmlTag {
+        private final String tag;
+        private final String clazz;
+
+        HtmlTag() {
+            this("");
+        }
+
+        HtmlTag(String tag) {
+            this(tag, "");
+        }
+
+        HtmlTag(String tag, String clazz) {
+            this.tag = tag;
+            this.clazz = clazz;
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) return true;
+            if (o == null || getClass() != o.getClass()) return false;
+
+            HtmlTag htmlTag = (HtmlTag) o;
+
+            if (tag != null ? !tag.equals(htmlTag.tag) : htmlTag.tag != null) return false;
+            return clazz != null ? clazz.equals(htmlTag.clazz) : htmlTag.clazz == null;
+        }
+
+        @Override
+        public int hashCode() {
+            int result = tag != null ? tag.hashCode() : 0;
+            result = 31 * result + (clazz != null ? clazz.hashCode() : 0);
+            return result;
+        }
+    }
+
+    private static class ObjectRef {
+        private final long objId;
+        private final int version;
+
+        public ObjectRef(long objId, int version) {
+            this.objId = objId;
+            this.version = version;
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) return true;
+            if (o == null || getClass() != o.getClass()) return false;
+            ObjectRef objectRef = (ObjectRef) o;
+            return objId == objectRef.objId &&
+                    version == objectRef.version;
+        }
+
+        @Override
+        public int hashCode() {
+            return Objects.hash(objId, version);
+        }
+
+        @Override
+        public String toString() {
+            return "ObjectRef{" +
+                    "objId=" + objId +
+                    ", version=" + version +
+                    '}';
+        }
+    }
+
+    /**
+     * In PDF land, MCID are integers that should be unique _per page_.
+     * This class includes the object ref to the page and the mcid
+     * so that this should be a cross-document unique key to
+     * given content.
+     * <p>
+     * If the mcid integer == -1, that means that there is text on the page
+     * not assigned to any marked content.
+     */
+    private static class MCID {
+        //this is the object ref to the particular page
+        private final ObjectRef objectRef;
+        private final int mcid;
+
+        public MCID(ObjectRef objectRef, int mcid) {
+            this.objectRef = objectRef;
+            this.mcid = mcid;
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) return true;
+            if (o == null || getClass() != o.getClass()) return false;
+            MCID mcid1 = (MCID) o;
+            return mcid == mcid1.mcid &&
+                    Objects.equals(objectRef, mcid1.objectRef);
+        }
+
+        @Override
+        public int hashCode() {
+            return Objects.hash(objectRef, mcid);
+        }
+
+        @Override
+        public String toString() {
+            return "MCID{" +
+                    "objectRef=" + objectRef +
+                    ", mcid=" + mcid +
+                    '}';
+        }
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index a63754e..41644bf 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -24,16 +24,18 @@ import java.util.Arrays;
 import java.util.Calendar;
 import java.util.Collections;
 import java.util.List;
-import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
 
 import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.pdfbox.cos.COSArray;
+import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSDictionary;
 import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.io.MemoryUsageSetting;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDDocumentInformation;
+import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
 import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
 import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
 import org.apache.tika.config.Field;
@@ -148,11 +150,15 @@ public class PDFParser extends AbstractParser implements Initializable {
             if (handler != null) {
                 boolean hasXFA = hasXFA(pdfDocument);
                 metadata.set(PDF.HAS_XFA, Boolean.toString(hasXFA));
+                boolean hasMarkedContent = hasMarkedContent(pdfDocument);
+                metadata.set(PDF.HAS_MARKED_CONTENT, Boolean.toString(hasMarkedContent));
                 if (shouldHandleXFAOnly(hasXFA, localConfig)) {
                     handleXFAOnly(pdfDocument, handler, metadata, context);
                 } else if (localConfig.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) {
                     metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
                     OCR2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
+                } else if (hasMarkedContent && localConfig.getExtractMarkedContent()) {
+                    PDFMarkedContent2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
                 } else {
                     if (localConfig.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
                         metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
@@ -170,6 +176,28 @@ public class PDFParser extends AbstractParser implements Initializable {
         }
     }
 
+    private boolean hasMarkedContent(PDDocument pdDocument) {
+        PDStructureTreeRoot root = pdDocument.getDocumentCatalog().getStructureTreeRoot();
+        if (root == null) {
+            return false;
+        }
+        COSBase base = root.getK();
+        if (base == null) {
+            return false;
+        }
+        //TODO: are there other checks we need to perform?
+        if (base instanceof COSDictionary) {
+            if (((COSDictionary)base).keySet().size() > 0) {
+                return true;
+            }
+        } else if (base instanceof COSArray) {
+            if (((COSArray) base).size() > 0) {
+                return true;
+            }
+        }
+        return false;
+    }
+
     private String getPassword(Metadata metadata, ParseContext context) {
         String password = null;
 
@@ -506,6 +534,11 @@ public class PDFParser extends AbstractParser implements Initializable {
     }
 
     @Field
+    void setExtractMarkedContent(boolean extractMarkedContent) {
+        defaultConfig.setExtractMarkedContent(extractMarkedContent);
+    }
+
+    @Field
     void setInitializableProblemHander(String name) {
         if ("ignore".equals(name)) {
             setInitializableProblemHandler(InitializableProblemHandler.IGNORE);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index 44324c2..178a5f8 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -1,5 +1,3 @@
-package org.apache.tika.parser.pdf;
-
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -16,6 +14,7 @@ package org.apache.tika.parser.pdf;
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+package org.apache.tika.parser.pdf;
 
 import java.io.IOException;
 import java.io.InputStream;
@@ -110,6 +109,10 @@ public class PDFParserConfig implements Serializable {
     //a pdf file) should only be extracted once.
     private boolean extractUniqueInlineImagesOnly = true;
 
+    //Should the PDFParser _try_ to extract marked content/structure tags (back off to regular
+    //text extraction if the given PDF doesn't have marked content)
+    private boolean extractMarkedContent = false;
+
     //The character width-based tolerance value used to estimate where spaces in text should be added
     private Float averageCharTolerance;
 
@@ -228,6 +231,8 @@ public class PDFParserConfig implements Serializable {
 
         setExtractActions(getBooleanProp(props.getProperty("extractActions"), false));
 
+        setExtractMarkedContent(getBooleanProp(props.getProperty("extractMarkedContent"), false));
+
         setSetKCMS(getBooleanProp(props.getProperty("setKCMS"), false));
 
         boolean checkExtractAccessPermission = getBooleanProp(props.getProperty("checkExtractAccessPermission"), false);
@@ -246,6 +251,22 @@ public class PDFParserConfig implements Serializable {
     }
 
     /**
+     * If the PDF contains marked content, try to extract text and its marked structure.
+     * If the PDF does not contain marked content, fall back to the regular PDF2XHTML for
+     * text extraction.  As of 1.24, this is an "alpha" version.
+     *
+     * @param extractMarkedContent whether or not to try to extract marked content/structure tags
+     * @since 1.24
+     */
+    public void setExtractMarkedContent(boolean extractMarkedContent) {
+        this.extractMarkedContent = extractMarkedContent;
+    }
+
+    public boolean getExtractMarkedContent() {
+        return extractMarkedContent;
+    }
+
+    /**
      * Configures the given pdf2XHTML.
      *
      * @param pdf2XHTML
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTMLTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTMLTest.java
new file mode 100644
index 0000000..ab30c16
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTMLTest.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+
+
+public class PDFMarkedContent2XHTMLTest extends TikaTest {
+
+    static ParseContext MARKUP_CONTEXT = new ParseContext();
+
+    @BeforeClass
+    public static void setUp() {
+        PDFParserConfig config = new PDFParserConfig();
+        config.setExtractMarkedContent(true);
+
+        MARKUP_CONTEXT.set(PDFParserConfig.class, config);
+    }
+
+    @Test
+    public void testJournal() throws Exception {
+        String xml = getXML("testJournalParser.pdf", MARKUP_CONTEXT).xml;
+        assertContains("<h1>I. INTRODUCTION</h1>", xml);
+        assertContains("<table><tr>\t<td><p />", xml);
+        assertContains("</td>\t<td><p>NHG</p>", xml);
+        assertContains("</td>\t<td><p>STRING</p>", xml);
+    }
+
+    @Test
+    public void testVarious() throws Exception {
+        String xml = getXML("testPDFVarious.pdf", MARKUP_CONTEXT).xml;
+        assertContains("<div class=\"textbox\"><p>Here is a text box</p>", xml);
+        assertContains("<div class=\"footnote\"><p>1 This is a footnote.</p>", xml);
+        assertContains("<ul>\t<li>Bullet 1</li>", xml);
+        assertContains("<table><tr>\t<td><p>Row 1 Col 1</p>", xml);
+        assertContains("<p>Here is a citation:</p>", xml);
+        assertContains("a href=\"http://tika.apache.org/\">This is a hyperlink</a>", xml);
+        assertContains("This is the header text.", xml);
+        assertContains("This is the footer text.", xml);
+    }
+
+    @Test
+    public void testChildAttachments() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testPDF_childAttachments.pdf", MARKUP_CONTEXT);
+
+        //make sure that embedded docs are still getting extracted
+        assertEquals(3, metadataList.size());
+
+        String xml = metadataList.get(0).get(RecursiveParserWrapperHandler.TIKA_CONTENT);
+        System.out.println(xml);
+        //In the annotations (which the classic PDF2XHTML extracts), the <a> content is
+        //identical to the href.  Here the link text differs from the href, which proves
+        //the text was extracted from the marked content tree rather than the annotations.
+        assertContains("<a href=\"http://www.irs.gov\">IRS.gov</a>", xml);
+        assertContains("<a href=\"http://www.irs.gov/pub15\">www.irs.gov/pub15</a>", xml);
+    }
+
+}
\ No newline at end of file