You are viewing a plain text version of this content; the canonical (hyperlinked) version is available in the mailing list archive.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/07/31 15:48:22 UTC
[tika] branch master updated: TIKA-2917 -- extract metadata that
accompanies inline images
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 8632510 TIKA-2917 -- extract metadata that accompanies inline images
8632510 is described below
commit 86325105ab206dca88d076dc865fcb17404c4531
Author: TALLISON <ta...@apache.org>
AuthorDate: Wed Jul 31 11:46:05 2019 -0400
TIKA-2917 -- extract metadata that accompanies inline images
---
.../tika/parser/image/xmp/JempboxExtractor.java | 4 +-
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 2 +-
.../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 177 ++++++-------
.../java/org/apache/tika/parser/pdf/PDFParser.java | 273 +++------------------
.../tika/parser/pdf/PDMetadataExtractor.java | 270 ++++++++++++++++++++
5 files changed, 393 insertions(+), 333 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
index 0f4f73b..de189cc 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
@@ -208,7 +208,9 @@ public class JempboxExtractor {
private static void addMetadata(Metadata m, Property p, String value) {
if (value != null) {
- m.add(p, value);
+ if (p.isMultiValuePermitted() || m.get(p) == null) {
+ m.add(p, value);
+ }
}
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 8116593..27e4df8 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -146,7 +146,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
final List<IOException> exceptions = new ArrayList<>();
final PDDocument pdDocument;
final XHTMLContentHandler xhtml;
- private final ParseContext context;
+ final ParseContext context;
final Metadata metadata;
final EmbeddedDocumentExtractor embeddedDocumentExtractor;
final PDFParserConfig config;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index 8f72429..079bd26 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -16,6 +16,19 @@
*/
package org.apache.tika.parser.pdf;
+import java.awt.image.BufferedImage;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.Writer;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
@@ -45,19 +58,6 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
-import java.awt.image.BufferedImage;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.Writer;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
/**
* Utility class that overrides the {@link PDFTextStripper} functionality
* to produce a semi-structured XHTML SAX events instead of a plain text
@@ -191,88 +191,93 @@ class PDF2XHTML extends AbstractPDF2XHTML {
EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
continue;
}
+ processImageObject(object, seenThisPage);
+ }
+ }
- if (object == null) {
- continue;
+ private void processImageObject(PDXObject object, Set<COSBase> seenThisPage) throws SAXException, IOException {
+ if (object == null) {
+ return;
+ }
+ COSStream cosStream = object.getCOSObject();
+ if (seenThisPage.contains(cosStream)) {
+ //avoid infinite recursion TIKA-1742
+ return;
+ }
+ seenThisPage.add(cosStream);
+
+ if (object instanceof PDFormXObject) {
+ extractImages(((PDFormXObject) object).getResources(), seenThisPage);
+ } else if (object instanceof PDImageXObject) {
+
+ PDImageXObject image = (PDImageXObject) object;
+
+ Metadata embeddedMetadata = new Metadata();
+ String extension = image.getSuffix();
+
+ if (extension == null || extension.equals("png")) {
+ embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/png");
+ extension = "png";
+ } else if (extension.equals("jpg")) {
+ embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+ } else if (extension.equals("tiff")) {
+ embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/tiff");
+ extension = "tif";
+ } else if (extension.equals("jpx")) {
+ embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/jp2");
+ } else if (extension.equals("jb2")) {
+ embeddedMetadata.set(
+ Metadata.CONTENT_TYPE, "image/x-jbig2");
+ } else {
+ //TODO: determine if we need to add more image types
+// throw new RuntimeException("EXTEN:" + extension);
}
- COSStream cosStream = object.getCOSObject();
- if (seenThisPage.contains(cosStream)) {
- //avoid infinite recursion TIKA-1742
- continue;
+ Integer imageNumber = processedInlineImages.get(cosStream);
+ if (imageNumber == null) {
+ imageNumber = inlineImageCounter++;
}
- seenThisPage.add(cosStream);
-
- if (object instanceof PDFormXObject) {
- extractImages(((PDFormXObject) object).getResources(), seenThisPage);
- } else if (object instanceof PDImageXObject) {
-
- PDImageXObject image = (PDImageXObject) object;
-
- Metadata embeddedMetadata = new Metadata();
- String extension = image.getSuffix();
-
- if (extension == null || extension.equals("png")) {
- embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/png");
- extension = "png";
- } else if (extension.equals("jpg")) {
- embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
- } else if (extension.equals("tiff")) {
- embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/tiff");
- extension = "tif";
- } else if (extension.equals("jpx")) {
- embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/jp2");
- } else if (extension.equals("jb2")) {
- embeddedMetadata.set(
- Metadata.CONTENT_TYPE, "image/x-jbig2");
- } else {
- //TODO: determine if we need to add more image types
-// throw new RuntimeException("EXTEN:" + extension);
- }
- Integer imageNumber = processedInlineImages.get(cosStream);
- if (imageNumber == null) {
- imageNumber = inlineImageCounter++;
- }
- String fileName = "image" + imageNumber + "."+extension;
- embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileName);
-
- // Output the img tag
- AttributesImpl attr = new AttributesImpl();
- attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName);
- attr.addAttribute("", "alt", "alt", "CDATA", fileName);
- xhtml.startElement("img", attr);
- xhtml.endElement("img");
-
- //Do we only want to process unique COSObject ids?
- //If so, have we already processed this one?
- if (config.getExtractUniqueInlineImagesOnly() == true) {
- if (processedInlineImages.containsKey(cosStream)) {
- continue;
- }
- processedInlineImages.put(cosStream, imageNumber);
+ String fileName = "image" + imageNumber + "." + extension;
+ embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileName);
+
+ // Output the img tag
+ AttributesImpl attr = new AttributesImpl();
+ attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName);
+ attr.addAttribute("", "alt", "alt", "CDATA", fileName);
+ xhtml.startElement("img", attr);
+ xhtml.endElement("img");
+
+ //Do we only want to process unique COSObject ids?
+ //If so, have we already processed this one?
+ if (config.getExtractUniqueInlineImagesOnly() == true) {
+ if (processedInlineImages.containsKey(cosStream)) {
+ return;
}
+ processedInlineImages.put(cosStream, imageNumber);
+ }
- embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
- TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
+ embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+ TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
- if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
- ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+ if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
+ ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+ try {
+ //extract the metadata contained outside of the image
+ PDMetadataExtractor.extract(image.getMetadata(),
+ embeddedMetadata, context);
try {
- //TODO: handle image.getMetadata()?
- try {
- writeToBuffer(image, extension, buffer);
- } catch (IOException e) {
- EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
- continue;
- }
- try (InputStream embeddedIs = TikaInputStream.get(buffer.toByteArray())) {
- embeddedDocumentExtractor.parseEmbedded(
- embeddedIs,
- new EmbeddedContentHandler(xhtml),
- embeddedMetadata, false);
- }
+ writeToBuffer(image, extension, buffer);
} catch (IOException e) {
- handleCatchableIOE(e);
+ EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
+ return;
+ }
+ try (InputStream embeddedIs = TikaInputStream.get(buffer.toByteArray())) {
+ embeddedDocumentExtractor.parseEmbedded(
+ embeddedIs,
+ new EmbeddedContentHandler(xhtml),
+ embeddedMetadata, false);
}
+ } catch (IOException e) {
+ handleCatchableIOE(e);
}
}
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 9840437..38c367d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -230,50 +230,38 @@ public class PDFParser extends AbstractParser implements Initializable {
metadata.set(TikaCoreProperties.LANGUAGE, document.getDocumentCatalog().getLanguage());
}
- //now go for the XMP
- Document dom = loadDOM(document.getDocumentCatalog().getMetadata(), metadata, context);
+ PDMetadataExtractor.extract(document.getDocumentCatalog().getMetadata(), metadata, context);
- XMPMetadata xmp = null;
- if (dom != null) {
- xmp = new XMPMetadata(dom);
+ PDDocumentInformation info = document.getDocumentInformation();
+ metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
+ PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_TITLE, info.getTitle());
+ PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_CREATOR, info.getAuthor());
+ //if this wasn't already set by xmp, use doc info
+ if (metadata.get(TikaCoreProperties.CREATOR) == null) {
+ PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.CREATOR, info.getAuthor());
+ }
+ if (metadata.get(TikaCoreProperties.TITLE) == null) {
+ PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.TITLE, info.getTitle());
}
- XMPSchemaDublinCore dcSchema = null;
+ PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_CREATOR_TOOL, info.getCreator());
+ PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
+ PDMetadataExtractor.addMetadata(metadata, Office.KEYWORDS, info.getKeywords());
+ PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_KEY_WORDS, info.getKeywords());
+ PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_PRODUCER, info.getProducer());
- if (xmp != null) {
- try {
- dcSchema = xmp.getDublinCoreSchema();
- } catch (IOException e) {}
+ PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_SUBJECT, info.getSubject());
- JempboxExtractor.extractXMPMM(xmp, metadata);
- }
+ PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.SUBJECT, info.getKeywords());
+ PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.SUBJECT, info.getSubject());
+ PDMetadataExtractor.addMetadata(metadata, OfficeOpenXMLCore.SUBJECT, info.getSubject());
- PDDocumentInformation info = document.getDocumentInformation();
- metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
- extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
- addMetadata(metadata, PDF.DOC_INFO_TITLE, info.getTitle());
- extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema);
- addMetadata(metadata, PDF.DOC_INFO_CREATOR, info.getAuthor());
- extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema);
- addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
- addMetadata(metadata, PDF.DOC_INFO_CREATOR_TOOL, info.getCreator());
- addMetadata(metadata, Office.KEYWORDS, info.getKeywords());
- addMetadata(metadata, PDF.DOC_INFO_KEY_WORDS, info.getKeywords());
- addMetadata(metadata, PDF.DOC_INFO_PRODUCER, info.getProducer());
- extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);
-
- addMetadata(metadata, PDF.DOC_INFO_SUBJECT, info.getSubject());
-
- addMetadata(metadata, TikaCoreProperties.SUBJECT, info.getKeywords());
- addMetadata(metadata, TikaCoreProperties.SUBJECT, info.getSubject());
- addMetadata(metadata, OfficeOpenXMLCore.SUBJECT, info.getSubject());
-
- addMetadata(metadata, PDF.DOC_INFO_TRAPPED, info.getTrapped());
+ PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_TRAPPED, info.getTrapped());
Calendar created = info.getCreationDate();
- addMetadata(metadata, PDF.DOC_INFO_CREATED, created);
- addMetadata(metadata, TikaCoreProperties.CREATED, created);
+ PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_CREATED, created);
+ PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.CREATED, created);
Calendar modified = info.getModificationDate();
- addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
- addMetadata(metadata, PDF.DOC_INFO_MODIFICATION_DATE, modified);
+ PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
+ PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_MODIFICATION_DATE, modified);
// All remaining metadata is custom
// Copy this over as-is
@@ -282,8 +270,8 @@ public class PDFParser extends AbstractParser implements Initializable {
for (COSName key : info.getCOSObject().keySet()) {
String name = key.getName();
if (!handledMetadata.contains(name)) {
- addMetadata(metadata, name, info.getCOSObject().getDictionaryObject(key));
- addMetadata(metadata, PDF.PDF_DOC_INFO_CUSTOM_PREFIX + name,
+ PDMetadataExtractor.addMetadata(metadata, name, info.getCOSObject().getDictionaryObject(key));
+ PDMetadataExtractor.addMetadata(metadata, PDF.PDF_DOC_INFO_CUSTOM_PREFIX + name,
info.getCOSObject().getDictionaryObject(key));
}
}
@@ -298,27 +286,7 @@ public class PDFParser extends AbstractParser implements Initializable {
MEDIA_TYPE.toString() + "; version=" +
Float.toString(document.getDocument().getVersion()));
- try {
- if (xmp != null) {
- xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
- XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class);
- if (pdfaxmp != null) {
- if (pdfaxmp.getPart() != null) {
- metadata.set(PDF.PDFAID_PART, Integer.toString(pdfaxmp.getPart()));
- }
- if (pdfaxmp.getConformance() != null) {
- metadata.set(PDF.PDFAID_CONFORMANCE, pdfaxmp.getConformance());
- String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
- metadata.set(PDF.PDFA_VERSION, version);
- metadata.add(TikaCoreProperties.FORMAT.getName(),
- MEDIA_TYPE.toString() + "; version=\"" + version + "\"");
- }
- }
- // TODO WARN if this XMP version is inconsistent with document header version?
- }
- } catch (IOException e) {
- metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e);
- }
+
//TODO: Let's try to move this into PDFBox.
//Attempt to determine Adobe extension level, if present:
COSDictionary root = document.getDocumentCatalog().getCOSObject();
@@ -346,169 +314,6 @@ public class PDFParser extends AbstractParser implements Initializable {
}
}
- /**
- * Try to extract all multilingual items from the XMPSchema
- * <p/>
- * This relies on the property having a valid xmp getName()
- * <p/>
- * For now, this only extracts the first language if the property does not allow multiple values (see TIKA-1295)
- *
- * @param metadata
- * @param property
- * @param pdfBoxBaseline
- * @param schema
- */
- private void extractMultilingualItems(Metadata metadata, Property property,
- String pdfBoxBaseline, XMPSchema schema) {
- //if schema is null, just go with pdfBoxBaseline
- if (schema == null) {
- if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
- addMetadata(metadata, property, pdfBoxBaseline);
- }
- return;
- }
-
- for (String lang : schema.getLanguagePropertyLanguages(property.getName())) {
- String value = schema.getLanguageProperty(property.getName(), lang);
-
- if (value != null && value.length() > 0) {
- //if you're going to add it below in the baseline addition, don't add it now
- if (pdfBoxBaseline != null && value.equals(pdfBoxBaseline)) {
- continue;
- }
- addMetadata(metadata, property, value);
- if (!property.isMultiValuePermitted()) {
- return;
- }
- }
- }
-
- if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
- //if we've already added something above and multivalue is not permitted
- //return.
- if (!property.isMultiValuePermitted()) {
- if (metadata.get(property) != null) {
- return;
- }
- }
- addMetadata(metadata, property, pdfBoxBaseline);
- }
- }
-
-
- /**
- * This tries to read a list from a particular property in
- * XMPSchemaDublinCore.
- * If it can't find the information, it falls back to the
- * pdfboxBaseline. The pdfboxBaseline should be the value
- * that pdfbox returns from its PDDocumentInformation object
- * (e.g. getAuthor()) This method is designed include the pdfboxBaseline,
- * and it should not duplicate the pdfboxBaseline.
- * <p/>
- * Until PDFBOX-1803/TIKA-1233 are fixed, do not call this
- * on dates!
- * <p/>
- * This relies on the property having a DublinCore compliant getName()
- *
- * @param property
- * @param pdfBoxBaseline
- * @param dc
- * @param metadata
- */
- private void extractDublinCoreListItems(Metadata metadata, Property property,
- String pdfBoxBaseline, XMPSchemaDublinCore dc) {
- //if no dc, add baseline and return
- if (dc == null) {
- if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
- addMetadata(metadata, property, pdfBoxBaseline);
- }
- return;
- }
- List<String> items = getXMPBagOrSeqList(dc, property.getName());
- if (items == null) {
- if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
- addMetadata(metadata, property, pdfBoxBaseline);
- }
- return;
- }
- for (String item : items) {
- if (pdfBoxBaseline != null && !item.equals(pdfBoxBaseline)) {
- addMetadata(metadata, property, item);
- }
- }
- //finally, add the baseline
- if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
- addMetadata(metadata, property, pdfBoxBaseline);
- }
- }
-
- /**
- * As of this writing, XMPSchema can contain bags or sequence lists
- * for some attributes...despite standards documentation.
- * JempBox expects one or the other for specific attributes.
- * Until more flexibility is added to JempBox, Tika will have to handle both.
- *
- * @param schema
- * @param name
- * @return list of values or null
- */
- private List<String> getXMPBagOrSeqList(XMPSchema schema, String name) {
- List<String> ret = schema.getBagList(name);
- if (ret == null) {
- ret = schema.getSequenceList(name);
- }
- return ret;
- }
-
- private void addMetadata(Metadata metadata, Property property, String value) {
- if (value != null) {
- String decoded = decode(value);
- if (property.isMultiValuePermitted() || metadata.get(property) == null) {
- metadata.add(property, decoded);
- }
- //silently skip adding property that already exists if multiple values are not permitted
- }
- }
-
- private void addMetadata(Metadata metadata, String name, String value) {
- if (value != null) {
- metadata.add(name, decode(value));
- }
- }
-
- private String decode(String value) {
- if (PDFEncodedStringDecoder.shouldDecode(value)) {
- PDFEncodedStringDecoder d = new PDFEncodedStringDecoder();
- return d.decode(value);
- }
- return value;
- }
-
-
- private void addMetadata(Metadata metadata, Property property, Calendar value) {
- if (value != null) {
- metadata.set(property, value);
- }
- }
-
- /**
- * Used when processing custom metadata entries, as PDFBox won't do
- * the conversion for us in the way it does for the standard ones
- */
- private void addMetadata(Metadata metadata, String name, COSBase value) {
- if (value instanceof COSArray) {
- for (Object v : ((COSArray) value).toList()) {
- addMetadata(metadata, name, ((COSBase) v));
- }
- } else if (value instanceof COSString) {
- addMetadata(metadata, name, ((COSString) value).getString());
- }
- // Avoid calling COSDictionary#toString, since it can lead to infinite
- // recursion. See TIKA-1038 and PDFBOX-1835.
- else if (value != null && !(value instanceof COSDictionary)) {
- addMetadata(metadata, name, value.toString());
- }
- }
private boolean shouldHandleXFAOnly(PDDocument pdDocument, PDFParserConfig config) {
@@ -725,28 +530,6 @@ public class PDFParser extends AbstractParser implements Initializable {
public void setInitializableProblemHandler(InitializableProblemHandler initializableProblemHandler) {
this.initializableProblemHandler = initializableProblemHandler;
}
- //can return null!
- private Document loadDOM(PDMetadata pdMetadata, Metadata metadata, ParseContext context) {
- if (pdMetadata == null) {
- return null;
- }
- InputStream is = null;
- try {
- try {
- is = pdMetadata.exportXMPMetadata();
- } catch (IOException e) {
- EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
- return null;
- }
- return XMLReaderUtils.buildDOM(is, context);
- } catch (IOException|SAXException|TikaException e) {
- EmbeddedDocumentUtil.recordException(e, metadata);
- } finally {
- IOUtils.closeQuietly(is);
- }
- return null;
-
- }
/**
* This is a no-op. There is no need to initialize multiple fields.
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
new file mode 100644
index 0000000..984a38b
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
@@ -0,0 +1,270 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Calendar;
+import java.util.List;
+import java.util.Locale;
+
+import org.apache.jempbox.xmp.XMPMetadata;
+import org.apache.jempbox.xmp.XMPSchema;
+import org.apache.jempbox.xmp.XMPSchemaDublinCore;
+import org.apache.jempbox.xmp.pdfa.XMPSchemaPDFAId;
+import org.apache.pdfbox.cos.COSArray;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.cos.COSString;
+import org.apache.pdfbox.pdmodel.common.PDMetadata;
+import org.apache.poi.util.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PDF;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.image.xmp.JempboxExtractor;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.w3c.dom.Document;
+import org.xml.sax.SAXException;
+
+class PDMetadataExtractor {
+
+ private static final MediaType MEDIA_TYPE = MediaType.application("pdf");
+
+
+ static void extract(PDMetadata pdMetadata, Metadata metadata, ParseContext context) {
+ if (pdMetadata == null) {
+ return;
+ }
+ //now go for the XMP
+ Document dom = loadDOM(pdMetadata, metadata, context);
+
+ XMPMetadata xmp = null;
+ if (dom != null) {
+ xmp = new XMPMetadata(dom);
+ }
+ XMPSchemaDublinCore dcSchema = null;
+
+ if (xmp != null) {
+ try {
+ dcSchema = xmp.getDublinCoreSchema();
+ } catch (IOException e) {
+ }
+
+ JempboxExtractor.extractXMPMM(xmp, metadata);
+ }
+
+ extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);
+ extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, dcSchema);
+ extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, dcSchema);
+ extractMultilingualItems(metadata, TikaCoreProperties.TITLE, null, dcSchema);
+
+ try {
+ if (xmp != null) {
+ xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
+ XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class);
+ if (pdfaxmp != null) {
+ if (pdfaxmp.getPart() != null) {
+ metadata.set(PDF.PDFAID_PART, Integer.toString(pdfaxmp.getPart()));
+ }
+ if (pdfaxmp.getConformance() != null) {
+ metadata.set(PDF.PDFAID_CONFORMANCE, pdfaxmp.getConformance());
+ String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
+ metadata.set(PDF.PDFA_VERSION, version);
+ metadata.add(TikaCoreProperties.FORMAT.getName(),
+ MEDIA_TYPE.toString() + "; version=\"" + version + "\"");
+ }
+ }
+ // TODO WARN if this XMP version is inconsistent with document header version?
+ }
+ } catch (IOException e) {
+ metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e);
+ }
+ }
+
+ /**
+ * As of this writing, XMPSchema can contain bags or sequence lists
+ * for some attributes...despite standards documentation.
+ * JempBox expects one or the other for specific attributes.
+ * Until more flexibility is added to JempBox, Tika will have to handle both.
+ *
+ * @param schema
+ * @param name
+ * @return list of values or null
+ */
+ static List<String> getXMPBagOrSeqList(XMPSchema schema, String name) {
+ List<String> ret = schema.getBagList(name);
+ if (ret == null) {
+ ret = schema.getSequenceList(name);
+ }
+ return ret;
+ }
+
+ /**
+ * Try to extract all multilingual items from the XMPSchema
+ * <p/>
+ * This relies on the property having a valid xmp getName()
+ * <p/>
+ * For now, this only extracts the first language if the property does not allow multiple values (see TIKA-1295)
+ *
+ * @param metadata
+ * @param property
+ * @param pdfBoxBaseline
+ * @param schema
+ */
+ private static void extractMultilingualItems(Metadata metadata, Property property,
+ String pdfBoxBaseline, XMPSchema schema) {
+ //if schema is null, just go with pdfBoxBaseline
+ if (schema == null) {
+ if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
+ addMetadata(metadata, property, pdfBoxBaseline);
+ }
+ return;
+ }
+
+ for (String lang : schema.getLanguagePropertyLanguages(property.getName())) {
+ String value = schema.getLanguageProperty(property.getName(), lang);
+
+ if (value != null && value.length() > 0) {
+ //if you're going to add it below in the baseline addition, don't add it now
+ if (pdfBoxBaseline != null && value.equals(pdfBoxBaseline)) {
+ continue;
+ }
+ addMetadata(metadata, property, value);
+ if (!property.isMultiValuePermitted()) {
+ return;
+ }
+ }
+ }
+
+ if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
+ //if we've already added something above and multivalue is not permitted
+ //return.
+ if (!property.isMultiValuePermitted()) {
+ if (metadata.get(property) != null) {
+ return;
+ }
+ }
+ addMetadata(metadata, property, pdfBoxBaseline);
+ }
+ }
+
+
+ /**
+ * This tries to read a list from a particular property in
+ * XMPSchemaDublinCore.
+ * <p/>
+ * Until PDFBOX-1803/TIKA-1233 are fixed, do not call this
+ * on dates!
+ * <p/>
+ * This relies on the property having a DublinCore compliant getName()
+ *
+ * @param property
+ * @param dc
+ * @param metadata
+ */
+ private static void extractDublinCoreListItems(Metadata metadata, Property property, XMPSchemaDublinCore dc) {
+ //if no dc, add baseline and return
+ if (dc == null) {
+ return;
+ }
+ List<String> items = getXMPBagOrSeqList(dc, property.getName());
+ if (items == null) {
+ return;
+ }
+ for (String item : items) {
+ addMetadata(metadata, property, item);
+ }
+ }
+
+
+ static void addMetadata(Metadata metadata, Property property, String value) {
+ if (value != null) {
+ String decoded = decode(value);
+ if (property.isMultiValuePermitted() || metadata.get(property) == null) {
+ metadata.add(property, decoded);
+ }
+ //silently skip adding property that already exists if multiple values are not permitted
+ }
+ }
+
+ static void addMetadata(Metadata metadata, String name, String value) {
+ if (value != null) {
+ metadata.add(name, decode(value));
+ }
+ }
+
+ static String decode(String value) {
+ if (PDFEncodedStringDecoder.shouldDecode(value)) {
+ PDFEncodedStringDecoder d = new PDFEncodedStringDecoder();
+ return d.decode(value);
+ }
+ return value;
+ }
+
+ //can return null!
+ private static Document loadDOM(PDMetadata pdMetadata, Metadata metadata, ParseContext context) {
+ if (pdMetadata == null) {
+ return null;
+ }
+ InputStream is = null;
+ try {
+ try {
+ is = pdMetadata.exportXMPMetadata();
+ } catch (IOException e) {
+ EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
+ return null;
+ }
+ return XMLReaderUtils.buildDOM(is, context);
+ } catch (IOException| SAXException | TikaException e) {
+ EmbeddedDocumentUtil.recordException(e, metadata);
+ } finally {
+ IOUtils.closeQuietly(is);
+ }
+ return null;
+
+ }
+
+ static void addMetadata(Metadata metadata, Property property, Calendar value) {
+ if (value != null) {
+ metadata.set(property, value);
+ }
+ }
+
+ /**
+ * Used when processing custom metadata entries, as PDFBox won't do
+ * the conversion for us in the way it does for the standard ones
+ */
+ static void addMetadata(Metadata metadata, String name, COSBase value) {
+ if (value instanceof COSArray) {
+ for (Object v : ((COSArray) value).toList()) {
+ addMetadata(metadata, name, ((COSBase) v));
+ }
+ } else if (value instanceof COSString) {
+ addMetadata(metadata, name, ((COSString) value).getString());
+ }
+ // Avoid calling COSDictionary#toString, since it can lead to infinite
+ // recursion. See TIKA-1038 and PDFBOX-1835.
+ else if (value != null && !(value instanceof COSDictionary)) {
+ addMetadata(metadata, name, value.toString());
+ }
+ }
+}