You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/02/13 17:17:26 UTC
[tika] branch master updated: TIKA-3026 -- initial capability to
extract text and markup (if it exists) in PDFs
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 45e60a2 TIKA-3026 -- initial capability to extract text and markup (if it exists) in PDFs
45e60a2 is described below
commit 45e60a2a8285f49bd9d9f3bdde172b23d0f067a0
Author: tallison <ta...@apache.org>
AuthorDate: Thu Feb 13 12:14:31 2020 -0500
TIKA-3026 -- initial capability to extract text and markup (if it exists) in PDFs
---
.../main/java/org/apache/tika/metadata/PDF.java | 2 +
.../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 4 +-
.../tika/parser/pdf/PDFMarkedContent2XHTML.java | 589 +++++++++++++++++++++
.../java/org/apache/tika/parser/pdf/PDFParser.java | 35 +-
.../apache/tika/parser/pdf/PDFParserConfig.java | 25 +-
.../parser/pdf/PDFMarkedContent2XHTMLTest.java | 81 +++
6 files changed, 731 insertions(+), 5 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index d9a6213..f129f84 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -90,4 +90,6 @@ public interface PDF {
* Has > 0 AcroForm fields
*/
Property HAS_ACROFORM_FIELDS = Property.internalBoolean(PDF_PREFIX+"hasAcroFormFields");
+
+ Property HAS_MARKED_CONTENT = Property.internalBoolean(PDF_PREFIX+"hasMarkedContent");
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index 4ed0d90..8c2f3f2 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -61,7 +61,7 @@ class PDF2XHTML extends AbstractPDF2XHTML {
*/
private Map<COSStream, Integer> processedInlineImages = new HashMap<>();
private AtomicInteger inlineImageCounter = new AtomicInteger(0);
- private PDF2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
+ PDF2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
PDFParserConfig config)
throws IOException {
super(document, handler, context, metadata, config);
@@ -146,7 +146,7 @@ class PDF2XHTML extends AbstractPDF2XHTML {
}
}
- private void extractImages(PDPage page) throws SAXException, IOException {
+ void extractImages(PDPage page) throws SAXException, IOException {
if (config.getExtractInlineImages() == false) {
return;
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
new file mode 100644
index 0000000..9f764f9
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
@@ -0,0 +1,589 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.pdfbox.cos.COSArray;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.cos.COSInteger;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSObject;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.PDPageTree;
+import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
+import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
+import org.apache.pdfbox.text.PDFMarkedContentExtractor;
+import org.apache.pdfbox.text.TextPosition;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOExceptionWithCause;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Set;
+
+/**
+ * <p>This was added in Tika 1.24 as an alpha version of a text extractor
+ * that builds the text from the marked text tree and includes/normalizes
+ * some of the structural tags.
+ * </p>
+ *
+ * @since 1.24
+ */
+
+public class PDFMarkedContent2XHTML extends PDF2XHTML {
+
+    //hard stop for the structure-tree walk so malformed/cyclic PDFs
+    //cannot blow the stack
+    private static final int MAX_RECURSION_DEPTH = 1000;
+    //fallback element used for structure types with no html equivalent
+    private static final String DIV = "div";
+    //default mapping of lower-cased pdf structure-element types -> html tags
+    private static final Map<String, HtmlTag> COMMON_TAG_MAP = new HashMap<>();
+
+    static {
+        //code requires these keys to be all lower case --
+        //getTag() lower-cases the structure type name before the lookup
+        COMMON_TAG_MAP.put("document", new HtmlTag("body"));
+        COMMON_TAG_MAP.put("div", new HtmlTag("div"));
+        COMMON_TAG_MAP.put("p", new HtmlTag("p"));
+        COMMON_TAG_MAP.put("span", new HtmlTag("span"));
+        COMMON_TAG_MAP.put("table", new HtmlTag("table"));
+        COMMON_TAG_MAP.put("thead", new HtmlTag("thead"));
+        COMMON_TAG_MAP.put("tbody", new HtmlTag("tbody"));
+        COMMON_TAG_MAP.put("tr", new HtmlTag("tr"));
+        COMMON_TAG_MAP.put("th", new HtmlTag("th"));
+        COMMON_TAG_MAP.put("td", new HtmlTag("td"));//TODO -- convert to th if in thead?
+        COMMON_TAG_MAP.put("l", new HtmlTag("ul"));
+        COMMON_TAG_MAP.put("li", new HtmlTag("li"));
+        COMMON_TAG_MAP.put("h1", new HtmlTag("h1"));
+        COMMON_TAG_MAP.put("h2", new HtmlTag("h2"));
+        COMMON_TAG_MAP.put("h3", new HtmlTag("h3"));
+        COMMON_TAG_MAP.put("h4", new HtmlTag("h4"));
+        COMMON_TAG_MAP.put("h5", new HtmlTag("h5"));
+        COMMON_TAG_MAP.put("h6", new HtmlTag("h6"));
+    }
+
+    //this stores state as we recurse through the structure tag tree
+    private State state = new State();
+
+    /**
+     * Private: instances are created only via {@link #process}.
+     * All initialization is delegated to the PDF2XHTML superclass.
+     */
+    private PDFMarkedContent2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
+                                   PDFParserConfig config)
+            throws IOException {
+        super(document, handler, context, metadata, config);
+    }
+
+    /**
+     * Converts the given PDF document (and related metadata) to a stream
+     * of XHTML SAX events sent to the given content handler.
+     *
+     * @param pdDocument PDF document
+     * @param handler SAX content handler
+     * @param context parse context
+     * @param metadata PDF metadata
+     * @param config parser configuration
+     * @throws SAXException if the content handler fails to process SAX events
+     * @throws TikaException if there was an exception outside of per page processing
+     */
+    public static void process(
+            PDDocument pdDocument, ContentHandler handler, ParseContext context, Metadata metadata,
+            PDFParserConfig config)
+            throws SAXException, TikaException {
+
+        PDFMarkedContent2XHTML pdfMarkedContent2XHTML = null;
+        try {
+            pdfMarkedContent2XHTML = new PDFMarkedContent2XHTML(pdDocument, handler, context, metadata, config);
+        } catch (IOException e) {
+            throw new TikaException("couldn't initialize PDFMarkedContent2XHTML", e);
+        }
+        try {
+            //the extractor writes to the xhtml handler directly;
+            //hand the superclass a no-op Writer to satisfy its API
+            pdfMarkedContent2XHTML.writeText(pdDocument, new Writer() {
+                @Override
+                public void write(char[] cbuf, int off, int len) {
+                }
+
+                @Override
+                public void flush() {
+                }
+
+                @Override
+                public void close() {
+                }
+            });
+        } catch (IOException e) {
+            //unwrap SAXExceptions that were tunneled through IOException
+            if (e.getCause() instanceof SAXException) {
+                throw (SAXException) e.getCause();
+            } else {
+                throw new TikaException("Unable to extract PDF content", e);
+            }
+        }
+        if (!pdfMarkedContent2XHTML.exceptions.isEmpty()) {
+            //throw the first
+            throw new TikaException("Unable to extract PDF content", pdfMarkedContent2XHTML.exceptions.get(0));
+        }
+    }
+
+    /**
+     * Drives extraction in six steps: map page object refs, load the role
+     * map, collect text per MCID, replay the structure tree as XHTML,
+     * flush any text the tree never referenced, then run the ordinary
+     * per-page handling (annotations, attachments, etc.).
+     */
+    @Override
+    protected void processPages(PDPageTree pages) throws IOException {
+
+        //this is a 0-indexed list of object refs for each page
+        //we need this to map the mcids later...
+        //TODO: is there a better way of getting these/doing the mapping?
+
+        List<ObjectRef> pageRefs = new ArrayList<>();
+        //STEP 1: get the page refs
+        findPages(pdDocument.getPages().getCOSObject().getItem(COSName.KIDS), pageRefs);
+        //confirm the right number of pages was found
+        if (pageRefs.size() != pdDocument.getNumberOfPages()) {
+            throw new IOExceptionWithCause(
+                    new TikaException("Couldn't find the right number of page refs ("
+                            + pageRefs.size() + ") for pages (" +
+                            pdDocument.getNumberOfPages() + ")"));
+        }
+
+        PDStructureTreeRoot structureTreeRoot = pdDocument.getDocumentCatalog().getStructureTreeRoot();
+
+        //STEP 2: load the roleMap
+        Map<String, HtmlTag> roleMap = loadRoleMap(structureTreeRoot.getRoleMap());
+
+        //STEP 3: load all of the text, mapped to MCIDs
+        Map<MCID, String> paragraphs = loadTextByMCID(pageRefs);
+
+        //STEP 4: now recurse the structure tree root and output the structure
+        //and the text bits from paragraphs
+
+        try {
+            recurse(structureTreeRoot.getK(), null, 0, paragraphs, roleMap);
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause(e);
+        }
+
+        //STEP 5: handle all the potentially unprocessed bits
+        try {
+            //a link may still be buffered if the tree ended inside one
+            if (state.hrefAnchorBuilder.length() > 0) {
+                xhtml.startElement("p");
+                writeString(state.hrefAnchorBuilder.toString());
+                xhtml.endElement("p");
+            }
+            for (MCID mcid : paragraphs.keySet()) {
+                if (!state.processedMCIDs.contains(mcid)) {
+                    if (mcid.mcid > -1) {
+                        //TODO: LOG! piece of text that wasn't referenced in the marked content tree
+                        // but should have been. If mcid == -1, this was a known item not part of
+                        // content tree.
+                    }
+
+                    xhtml.startElement("p");
+                    writeString(paragraphs.get(mcid));
+                    xhtml.endElement("p");
+                }
+            }
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause(e);
+        }
+        //Step 6: for now, iterate through the pages again and do all the other handling
+        //TODO: figure out when we're crossing page boundaries during the recursion
+        // step above and do the page by page processing then...rather than dumping this
+        // all here.
+        for (PDPage page : pdDocument.getPages()) {
+            startPage(page);
+            endPage(page);
+        }
+
+    }
+
+    /**
+     * Depth-first walk of the structure tree ("K" entries), emitting XHTML
+     * elements for the structure tags and splicing in the text previously
+     * extracted per MCID.
+     *
+     * @param kids current node: a COSArray of kids, a COSObject (structure
+     *             element), a COSInteger (an mcid) or a COSDictionary
+     * @param currentPageRef page the mcids on this branch belong to;
+     *                       updated whenever a /Pg entry is seen
+     * @param depth current recursion depth, bounded by MAX_RECURSION_DEPTH
+     * @param paragraphs text keyed by MCID (built in loadTextByMCID)
+     * @param roleMap document role map; also used by getTag as a cache
+     */
+    private void recurse(COSBase kids, ObjectRef currentPageRef, int depth,
+                         Map<MCID, String> paragraphs, Map<String, HtmlTag> roleMap) throws IOException, SAXException {
+
+        if (depth > MAX_RECURSION_DEPTH) {
+            throw new IOExceptionWithCause(
+                    new TikaException("Exceeded max recursion depth "+MAX_RECURSION_DEPTH));
+        }
+
+        if (kids instanceof COSArray) {
+            //increment depth here too so that pathologically nested arrays
+            //cannot recurse past MAX_RECURSION_DEPTH
+            for (COSBase k : ((COSArray) kids)) {
+                recurse(k, currentPageRef, depth + 1, paragraphs, roleMap);
+            }
+        } else if (kids instanceof COSObject) {
+            //an OBJR is an indirect reference to the actual content object
+            COSBase cosType = ((COSObject)kids).getItem(COSName.TYPE);
+            if (cosType instanceof COSName) {
+                if ("OBJR".equals(((COSName)cosType).getName())) {
+                    recurse(((COSObject)kids).getDictionaryObject(COSName.OBJ),currentPageRef,
+                            depth+1, paragraphs, roleMap);
+                }
+            }
+
+            //the structure type name, e.g. "P" or "Table"
+            COSBase n = ((COSObject) kids).getItem(COSName.S);
+            String name = "";
+            if (n instanceof COSName) {
+                name = ((COSName) n).getName();
+            }
+            COSBase grandkids = ((COSObject) kids).getItem(COSName.K);
+            if (grandkids == null) {
+                return;
+            }
+            //remember which page this branch's mcids belong to
+            COSBase pageBase = ((COSObject) kids).getItem(COSName.PG);
+
+            if (pageBase instanceof COSObject) {
+                currentPageRef = new ObjectRef(((COSObject) pageBase).getObjectNumber(),
+                        ((COSObject) pageBase).getGenerationNumber());
+            }
+
+            HtmlTag tag = getTag(name, roleMap);
+            boolean startedLink = false;
+            boolean ignoreTag = false;
+            if ("link".equals(tag.clazz)) {
+                //buffer everything inside the link; flushed by writeLink()
+                state.inLink = true;
+                startedLink = true;
+            }
+            if (!state.inLink) {
+                //TODO: currently suppressing span and lbody...
+                // is this what we want to do? What else should we suppress?
+                if ("span".equals(tag.tag)) {
+                    ignoreTag = true;
+                } else if ("lbody".equals(tag.clazz)) {
+                    ignoreTag = true;
+                }
+                if (!ignoreTag) {
+                    if (!StringUtils.isAllBlank(tag.clazz)) {
+                        xhtml.startElement(tag.tag, "class", tag.clazz);
+                    } else {
+                        xhtml.startElement(tag.tag);
+                    }
+                }
+            }
+
+            recurse(grandkids, currentPageRef, depth + 1, paragraphs, roleMap);
+            if (startedLink) {
+                writeLink();
+            }
+            //close only elements we actually opened above
+            if (!state.inLink && !startedLink && !ignoreTag) {
+                xhtml.endElement(tag.tag);
+            }
+        } else if (kids instanceof COSInteger) {
+            //a bare integer is an mcid on the current page
+            int mcidInt = ((COSInteger) kids).intValue();
+            MCID mcid = new MCID(currentPageRef, mcidInt);
+            if (paragraphs.containsKey(mcid)) {
+                if (state.inLink) {
+                    state.hrefAnchorBuilder.append(paragraphs.get(mcid));
+                } else {
+                    try {
+                        //if it isn't a uri, output this anyhow
+                        writeString(paragraphs.get(mcid));
+                    } catch (IOException e) {
+                        handleCatchableIOE(e);
+                    }
+                }
+                state.processedMCIDs.add(mcid);
+            } else {
+                //TODO: log can't find mcid
+            }
+        } else if (kids instanceof COSDictionary) {
+            //TODO: check for other types of dictionary?
+            COSDictionary dict = (COSDictionary) kids;
+            COSDictionary anchor = dict.getCOSDictionary(COSName.A);
+            //check for subtype /Link ?
+            //COSName subtype = obj.getCOSName(COSName.SUBTYPE);
+            if (anchor != null) {
+                state.uri = anchor.getString(COSName.URI);
+            } else {
+                if (dict.containsKey(COSName.K)) {
+                    recurse(dict.getDictionaryObject(COSName.K), currentPageRef, depth + 1, paragraphs, roleMap);
+                } else if (dict.containsKey(COSName.OBJ)) {
+                    recurse(dict.getDictionaryObject(COSName.OBJ), currentPageRef, depth + 1, paragraphs, roleMap);
+
+                }
+            }
+        } else {
+            //TODO: handle a different object?
+        }
+    }
+
+    /**
+     * Flushes the buffered anchor text accumulated while inside a link
+     * structure element: as an &lt;a href=...&gt; element when a URI was
+     * recorded, otherwise as plain text.  URI links only for now; for
+     * within-document references (GOTO) we would need to cache targets in
+     * State.  See testPDF_childAttachments.pdf for examples.
+     */
+    private void writeLink() throws SAXException, IOException {
+        String anchorText = state.hrefAnchorBuilder.toString();
+        if (StringUtils.isAllBlank(state.uri)) {
+            try {
+                //if it isn't a uri, output this anyhow
+                writeString(anchorText);
+            } catch (IOException e) {
+                handleCatchableIOE(e);
+            }
+        } else {
+            xhtml.startElement("a", "href", state.uri);
+            xhtml.characters(anchorText);
+            xhtml.endElement("a");
+        }
+        //reset link state for the next anchor
+        state.hrefAnchorBuilder.setLength(0);
+        state.inLink = false;
+        state.uri = null;
+    }
+
+
+    /**
+     * Resolves a pdf structure-type name to an html tag: first via the
+     * document's role map, then via the built-in COMMON_TAG_MAP, falling
+     * back to a &lt;div class="..."&gt; for unknown types (cached in the
+     * role map so the fallback is built only once per name).
+     */
+    private HtmlTag getTag(String name, Map<String, HtmlTag> roleMap) {
+        //1) explicit mapping from the document's RoleMap
+        HtmlTag tag = roleMap.get(name);
+        if (tag != null) {
+            return tag;
+        }
+        //2) well-known structure types
+        String lc = name.toLowerCase(Locale.US);
+        tag = COMMON_TAG_MAP.get(lc);
+        if (tag != null) {
+            return tag;
+        }
+        //3) unknown type: emit a div carrying the lower-cased name as class
+        tag = new HtmlTag(DIV, lc);
+        roleMap.put(name, tag);
+        return tag;
+    }
+
+
+    /**
+     * Converts the document's RoleMap (structure type -> standard type
+     * String) into html tags, mapping unrecognized standard types to
+     * &lt;div class="..."&gt;.
+     *
+     * @param roleMap may be null
+     * @return map of structure type -> html tag; never null
+     */
+    private static Map<String, HtmlTag> loadRoleMap(Map<String, Object> roleMap) {
+        if (roleMap == null) {
+            //typed empty map instead of the raw Collections.EMPTY_MAP
+            return Collections.emptyMap();
+        }
+        Map<String, HtmlTag> tags = new HashMap<>();
+        for (Map.Entry<String, Object> e : roleMap.entrySet()) {
+            Object obj = e.getValue();
+            //TODO: non-String values (e.g. nested dictionaries) are
+            //currently skipped silently
+            if (obj instanceof String) {
+                String lc = ((String) obj).toLowerCase(Locale.US);
+                if (COMMON_TAG_MAP.containsValue(new HtmlTag(lc))) {
+                    tags.put(e.getKey(), new HtmlTag(lc));
+                } else {
+                    tags.put(e.getKey(), new HtmlTag(DIV, lc));
+                }
+            }
+        }
+        return tags;
+    }
+
+    /**
+     * Runs PDFBox's marked content extractor over every page and returns
+     * the extracted text keyed by (page ref, mcid).  Text on a page that
+     * is not part of any marked content is concatenated under mcid == -1
+     * for that page.
+     *
+     * @param pageRefs 0-indexed object refs for each page (from findPages)
+     * @return text keyed by MCID
+     */
+    private Map<MCID, String> loadTextByMCID(List<ObjectRef> pageRefs) throws IOException {
+        int pageCount = 1;
+        Map<MCID, String> paragraphs = new HashMap<>();
+        for (PDPage page : pdDocument.getPages()) {
+            ObjectRef pageRef = pageRefs.get(pageCount - 1);
+            //increment here (not at the end of the loop): the original
+            //"continue" on a failed page skipped the increment and shifted
+            //the page refs of every subsequent page
+            pageCount++;
+            PDFMarkedContentExtractor ex = new PDFMarkedContentExtractor();
+            try {
+                ex.processPage(page);
+            } catch (IOException e) {
+                handleCatchableIOE(e);
+                continue;
+            }
+            for (PDMarkedContent c : ex.getMarkedContents()) {
+                //TODO: at some point also handle
+                // 1. c.getActualText()
+                // 2. c.getExpandedForm()
+                // 3. c.getAlternateDescription()
+                // 4. c.getLanguage()
+
+                List<Object> objects = c.getContents();
+                StringBuilder sb = new StringBuilder();
+                //TODO: sort text positions? Figure out when to add/remove a newline and/or space?
+                for (Object o : objects) {
+                    if (o instanceof TextPosition) {
+                        String unicode = ((TextPosition) o).getUnicode();
+                        if (unicode != null) {
+                            sb.append(unicode);
+                        }
+                    }/*
+                    TODO: do we want to do anything with these?
+                    TODO: Are there other types of objects we need to handle here?
+                    else if (o instanceof PDImageXObject) {
+
+                    } else if (o instanceof PDTransparencyGroup) {
+
+                    } else if (o instanceof PDMarkedContent) {
+
+                    } else if (o instanceof PDFormXObject) {
+
+                    } else {
+                        throw new RuntimeException("can't handle "+o.getClass());
+                    }*/
+                }
+
+                int mcidInt = c.getMCID();
+                MCID mcid = new MCID(pageRef, mcidInt);
+                String p = sb.toString();
+                //constant-first equals: getTag() may return null
+                if ("P".equals(c.getTag())) {
+                    p = p.trim();
+                }
+
+                if (mcidInt < 0) {
+                    //mcidInt == -1 for text bits that do not have an actual
+                    //mcid -- concatenate these bits
+                    if (paragraphs.containsKey(mcid)) {
+                        p = paragraphs.get(mcid) + "\n" + p;
+                    }
+                }
+
+                paragraphs.put(mcid, p);
+
+            }
+        }
+        return paragraphs;
+    }
+
+    /**
+     * Recursively collects the object refs of all /Page nodes under the
+     * given /Kids entry, in document order.
+     *
+     * @param kidsObj the /Kids entry of a /Pages node; may be null
+     * @param pageRefs list to append page refs to
+     */
+    private static void findPages(COSBase kidsObj, List<ObjectRef> pageRefs) {
+        findPages(kidsObj, pageRefs, new HashSet<Long>());
+    }
+
+    //visited guards against cyclic page trees in malformed PDFs, which
+    //would otherwise recurse forever
+    private static void findPages(COSBase kidsObj, List<ObjectRef> pageRefs, Set<Long> visited) {
+        if (kidsObj == null) {
+            return;
+        }
+        if (kidsObj instanceof COSArray) {
+            for (COSBase kid : ((COSArray) kidsObj)) {
+                if (!(kid instanceof COSObject)) {
+                    continue;
+                }
+                //NOTE(review): keyed by object number only; object number +
+                //generation would be stricter -- confirm collisions can't matter here
+                if (!visited.add(((COSObject) kid).getObjectNumber())) {
+                    continue;
+                }
+                COSBase kidbase = ((COSObject) kid).getObject();
+                if (!(kidbase instanceof COSDictionary)) {
+                    continue;
+                }
+                COSDictionary dict = (COSDictionary) kidbase;
+                if (dict.containsKey(COSName.TYPE) && COSName.PAGE.equals(dict.getCOSName(COSName.TYPE))) {
+                    pageRefs.add(new ObjectRef(((COSObject) kid).getObjectNumber(),
+                            ((COSObject) kid).getGenerationNumber()));
+                    continue;
+                }
+                if (dict.containsKey(COSName.KIDS)) {
+                    findPages(dict.getItem(COSName.KIDS), pageRefs, visited);
+                }
+            }
+        }
+    }
+
+
+    //Mutable state shared across the recursive structure-tree walk.
+    private static class State {
+        //MCIDs already emitted; used in STEP 5 of processPages to flush
+        //text the structure tree never referenced
+        Set<MCID> processedMCIDs = new HashSet<>();
+        //true while inside a "link" structure element
+        boolean inLink = false;
+        //anchor text buffered until the link element is closed
+        private StringBuilder hrefAnchorBuilder = new StringBuilder();
+        //href target recorded from the link's action dictionary, if any
+        private String uri = null;
+        //NOTE(review): tdDepth and tableDepth are never read in this class --
+        //presumably reserved for future table handling; confirm before relying on them
+        private int tdDepth = 0;
+        int tableDepth = 0;
+    }
+
+    /**
+     * Value class pairing an output html element name with an optional
+     * value for its "class" attribute ("" means no class attribute).
+     * Used both as map values and (via equals) for containsValue lookups.
+     */
+    private static class HtmlTag {
+        private final String tag;
+        private final String clazz;
+
+        HtmlTag() {
+            this("");
+        }
+
+        HtmlTag(String tag) {
+            this(tag, "");
+        }
+
+        HtmlTag(String tag, String clazz) {
+            this.tag = tag;
+            this.clazz = clazz;
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) return true;
+            if (o == null || getClass() != o.getClass()) return false;
+            HtmlTag htmlTag = (HtmlTag) o;
+            //Objects.equals replaces the hand-rolled null checks; the file
+            //already uses java.util.Objects elsewhere
+            return Objects.equals(tag, htmlTag.tag) && Objects.equals(clazz, htmlTag.clazz);
+        }
+
+        @Override
+        public int hashCode() {
+            return Objects.hash(tag, clazz);
+        }
+    }
+
+    /**
+     * Identifies an indirect PDF object by its object number and
+     * generation ("version").  Used here as a stable key for pages.
+     */
+    private static class ObjectRef {
+        private final long objId;
+        private final int version;
+
+        public ObjectRef(long objId, int version) {
+            this.objId = objId;
+            this.version = version;
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (o == this) {
+                return true;
+            }
+            if (o == null || o.getClass() != getClass()) {
+                return false;
+            }
+            ObjectRef other = (ObjectRef) o;
+            return this.objId == other.objId && this.version == other.version;
+        }
+
+        @Override
+        public int hashCode() {
+            return Objects.hash(objId, version);
+        }
+
+        @Override
+        public String toString() {
+            return "ObjectRef{" +
+                    "objId=" + objId +
+                    ", version=" + version +
+                    '}';
+        }
+    }
+
+    /**
+     * In PDF land, MCIDs are integers that are only unique per page.
+     * Pairing the page's object ref with the integer yields a key that is
+     * unique across the whole document.
+     * <p>
+     * An mcid of -1 marks text on the page that was not assigned to any
+     * marked content.
+     */
+    private static class MCID {
+        //object ref of the page this mcid belongs to
+        private final ObjectRef objectRef;
+        private final int mcid;
+
+        public MCID(ObjectRef objectRef, int mcid) {
+            this.objectRef = objectRef;
+            this.mcid = mcid;
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (o == this) {
+                return true;
+            }
+            if (o == null || o.getClass() != getClass()) {
+                return false;
+            }
+            MCID other = (MCID) o;
+            return this.mcid == other.mcid
+                    && Objects.equals(this.objectRef, other.objectRef);
+        }
+
+        @Override
+        public int hashCode() {
+            return Objects.hash(objectRef, mcid);
+        }
+
+        @Override
+        public String toString() {
+            return "MCID{" +
+                    "objectRef=" + objectRef +
+                    ", mcid=" + mcid +
+                    '}';
+        }
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index a63754e..41644bf 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -24,16 +24,18 @@ import java.util.Arrays;
import java.util.Calendar;
import java.util.Collections;
import java.util.List;
-import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.pdfbox.cos.COSArray;
+import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
+import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.apache.tika.config.Field;
@@ -148,11 +150,15 @@ public class PDFParser extends AbstractParser implements Initializable {
if (handler != null) {
boolean hasXFA = hasXFA(pdfDocument);
metadata.set(PDF.HAS_XFA, Boolean.toString(hasXFA));
+ boolean hasMarkedContent = hasMarkedContent(pdfDocument);
+ metadata.set(PDF.HAS_MARKED_CONTENT, Boolean.toString(hasMarkedContent));
if (shouldHandleXFAOnly(hasXFA, localConfig)) {
handleXFAOnly(pdfDocument, handler, metadata, context);
} else if (localConfig.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) {
metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
OCR2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
+ } else if (hasMarkedContent && localConfig.getExtractMarkedContent()) {
+ PDFMarkedContent2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
} else {
if (localConfig.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
@@ -170,6 +176,28 @@ public class PDFParser extends AbstractParser implements Initializable {
}
}
+    /**
+     * Heuristic: the document is considered to have marked content when
+     * its structure tree root exists and has a non-empty /K entry.
+     */
+    private boolean hasMarkedContent(PDDocument pdDocument) {
+        PDStructureTreeRoot root = pdDocument.getDocumentCatalog().getStructureTreeRoot();
+        if (root == null) {
+            return false;
+        }
+        COSBase base = root.getK();
+        if (base == null) {
+            return false;
+        }
+        //TODO: are there other checks we need to perform?
+        if (base instanceof COSDictionary) {
+            return !((COSDictionary) base).keySet().isEmpty();
+        }
+        if (base instanceof COSArray) {
+            return ((COSArray) base).size() > 0;
+        }
+        return false;
+    }
+
private String getPassword(Metadata metadata, ParseContext context) {
String password = null;
@@ -506,6 +534,11 @@ public class PDFParser extends AbstractParser implements Initializable {
}
@Field
+    //delegates to the default config; see
+    //PDFParserConfig#setExtractMarkedContent(boolean) for semantics
+    void setExtractMarkedContent(boolean extractMarkedContent) {
+        defaultConfig.setExtractMarkedContent(extractMarkedContent);
+    }
+
+ @Field
void setInitializableProblemHander(String name) {
if ("ignore".equals(name)) {
setInitializableProblemHandler(InitializableProblemHandler.IGNORE);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index 44324c2..178a5f8 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -1,5 +1,3 @@
-package org.apache.tika.parser.pdf;
-
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -16,6 +14,7 @@ package org.apache.tika.parser.pdf;
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+package org.apache.tika.parser.pdf;
import java.io.IOException;
import java.io.InputStream;
@@ -110,6 +109,10 @@ public class PDFParserConfig implements Serializable {
//a pdf file) should only be extracted once.
private boolean extractUniqueInlineImagesOnly = true;
+ //Should the PDFParser _try_ to extract marked content/structure tags (backoff to regular
+ //text extraction if the given PDF doesn't have marked content)
+ private boolean extractMarkedContent = false;
+
//The character width-based tolerance value used to estimate where spaces in text should be added
private Float averageCharTolerance;
@@ -228,6 +231,8 @@ public class PDFParserConfig implements Serializable {
setExtractActions(getBooleanProp(props.getProperty("extractActions"), false));
+ setExtractMarkedContent(getBooleanProp(props.getProperty("extractMarkedContent"), false));
+
setSetKCMS(getBooleanProp(props.getProperty("setKCMS"), false));
boolean checkExtractAccessPermission = getBooleanProp(props.getProperty("checkExtractAccessPermission"), false);
@@ -246,6 +251,22 @@ public class PDFParserConfig implements Serializable {
}
/**
+     * If the PDF contains marked content, try to extract text and its marked structure.
+     * If the PDF does not contain marked content, back off to the regular PDF2XHTML for
+     * text extraction. As of 1.24, this is an "alpha" version.
+     *
+     * @param extractMarkedContent whether or not to try marked content extraction
+     * @since 1.24
+     */
+    public void setExtractMarkedContent(boolean extractMarkedContent) {
+        this.extractMarkedContent = extractMarkedContent;
+    }
+
+    /**
+     * @return whether or not to try to extract marked content
+     * @since 1.24
+     */
+    public boolean getExtractMarkedContent() {
+        return extractMarkedContent;
+    }
+
+ /**
* Configures the given pdf2XHTML.
*
* @param pdf2XHTML
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTMLTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTMLTest.java
new file mode 100644
index 0000000..ab30c16
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTMLTest.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+
+
+public class PDFMarkedContent2XHTMLTest extends TikaTest {
+
+    //shared context that switches the PDFParser to marked-content extraction;
+    //final: the reference never changes after class init
+    static final ParseContext MARKUP_CONTEXT = new ParseContext();
+
+    @BeforeClass
+    public static void setUp() {
+        PDFParserConfig config = new PDFParserConfig();
+        config.setExtractMarkedContent(true);
+
+        MARKUP_CONTEXT.set(PDFParserConfig.class, config);
+    }
+
+    @Test
+    public void testJournal() throws Exception {
+        String xml = getXML("testJournalParser.pdf", MARKUP_CONTEXT).xml;
+        assertContains("<h1>I. INTRODUCTION</h1>", xml);
+        assertContains("<table><tr>\t<td><p />", xml);
+        assertContains("</td>\t<td><p>NHG</p>", xml);
+        assertContains("</td>\t<td><p>STRING</p>", xml);
+    }
+
+    @Test
+    public void testVarious() throws Exception {
+        String xml = getXML("testPDFVarious.pdf", MARKUP_CONTEXT).xml;
+        assertContains("<div class=\"textbox\"><p>Here is a text box</p>", xml);
+        assertContains("<div class=\"footnote\"><p>1 This is a footnote.</p>", xml);
+        assertContains("<ul>\t<li>Bullet 1</li>", xml);
+        assertContains("<table><tr>\t<td><p>Row 1 Col 1</p>", xml);
+        assertContains("<p>Here is a citation:</p>", xml);
+        assertContains("a href=\"http://tika.apache.org/\">This is a hyperlink</a>", xml);
+        assertContains("This is the header text.", xml);
+        assertContains("This is the footer text.", xml);
+    }
+
+    @Test
+    public void testChildAttachments() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testPDF_childAttachments.pdf", MARKUP_CONTEXT);
+
+        //make sure that embedded docs are still getting extracted
+        assertEquals(3, metadataList.size());
+
+        String xml = metadataList.get(0).get(RecursiveParserWrapperHandler.TIKA_CONTENT);
+        //the point here is that in the annotations (that we were grabbing by the classic PDF2XHTML),
+        //the <a> content is identical to the href. Here, they are not, which we only get from
+        //marked up content...victory!!!
+        assertContains("<a href=\"http://www.irs.gov\">IRS.gov</a>", xml);
+        assertContains("<a href=\"http://www.irs.gov/pub15\">www.irs.gov/pub15</a>", xml);
+    }
+
+}
\ No newline at end of file