You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/07/31 15:48:22 UTC

[tika] branch master updated: TIKA-2917 -- extract metadata that accompanies inline images

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new 8632510  TIKA-2917 -- extract metadata that accompanies inline images
8632510 is described below

commit 86325105ab206dca88d076dc865fcb17404c4531
Author: TALLISON <ta...@apache.org>
AuthorDate: Wed Jul 31 11:46:05 2019 -0400

    TIKA-2917 -- extract metadata that accompanies inline images
---
 .../tika/parser/image/xmp/JempboxExtractor.java    |   4 +-
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |   2 +-
 .../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 177 ++++++-------
 .../java/org/apache/tika/parser/pdf/PDFParser.java | 273 +++------------------
 .../tika/parser/pdf/PDMetadataExtractor.java       | 270 ++++++++++++++++++++
 5 files changed, 393 insertions(+), 333 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
index 0f4f73b..de189cc 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
@@ -208,7 +208,9 @@ public class JempboxExtractor {
 
     private static void addMetadata(Metadata m, Property p, String value) {
         if (value != null) {
-            m.add(p, value);
+            if (p.isMultiValuePermitted() || m.get(p) == null) {
+                m.add(p, value);
+            }
         }
     }
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 8116593..27e4df8 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -146,7 +146,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
     final List<IOException> exceptions = new ArrayList<>();
     final PDDocument pdDocument;
     final XHTMLContentHandler xhtml;
-    private final ParseContext context;
+    final ParseContext context;
     final Metadata metadata;
     final EmbeddedDocumentExtractor embeddedDocumentExtractor;
     final PDFParserConfig config;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index 8f72429..079bd26 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -16,6 +16,19 @@
  */
 package org.apache.tika.parser.pdf;
 
+import java.awt.image.BufferedImage;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.Writer;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
 import org.apache.pdfbox.cos.COSArray;
 import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSName;
@@ -45,19 +58,6 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
-import java.awt.image.BufferedImage;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.Writer;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
 /**
  * Utility class that overrides the {@link PDFTextStripper} functionality
  * to produce a semi-structured XHTML SAX events instead of a plain text
@@ -191,88 +191,93 @@ class PDF2XHTML extends AbstractPDF2XHTML {
                 EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
                 continue;
             }
+            processImageObject(object, seenThisPage);
+        }
+    }
 
-            if (object == null) {
-                continue;
+    private void processImageObject(PDXObject object, Set<COSBase> seenThisPage) throws SAXException, IOException {
+        if (object == null) {
+            return;
+        }
+        COSStream cosStream = object.getCOSObject();
+        if (seenThisPage.contains(cosStream)) {
+            //avoid infinite recursion TIKA-1742
+            return;
+        }
+        seenThisPage.add(cosStream);
+
+        if (object instanceof PDFormXObject) {
+            extractImages(((PDFormXObject) object).getResources(), seenThisPage);
+        } else if (object instanceof PDImageXObject) {
+
+            PDImageXObject image = (PDImageXObject) object;
+
+            Metadata embeddedMetadata = new Metadata();
+            String extension = image.getSuffix();
+
+            if (extension == null || extension.equals("png")) {
+                embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/png");
+                extension = "png";
+            } else if (extension.equals("jpg")) {
+                embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+            } else if (extension.equals("tiff")) {
+                embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/tiff");
+                extension = "tif";
+            } else if (extension.equals("jpx")) {
+                embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/jp2");
+            } else if (extension.equals("jb2")) {
+                embeddedMetadata.set(
+                        Metadata.CONTENT_TYPE, "image/x-jbig2");
+            } else {
+                //TODO: determine if we need to add more image types
+//                    throw new RuntimeException("EXTEN:" + extension);
             }
-            COSStream cosStream = object.getCOSObject();
-            if (seenThisPage.contains(cosStream)) {
-                //avoid infinite recursion TIKA-1742
-                continue;
+            Integer imageNumber = processedInlineImages.get(cosStream);
+            if (imageNumber == null) {
+                imageNumber = inlineImageCounter++;
             }
-            seenThisPage.add(cosStream);
-
-            if (object instanceof PDFormXObject) {
-                extractImages(((PDFormXObject) object).getResources(), seenThisPage);
-            } else if (object instanceof PDImageXObject) {
-
-                PDImageXObject image = (PDImageXObject) object;
-
-                Metadata embeddedMetadata = new Metadata();
-                String extension = image.getSuffix();
-                
-                if (extension == null || extension.equals("png")) {
-                    embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/png");
-                    extension = "png";
-                } else if (extension.equals("jpg")) {
-                    embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
-                } else if (extension.equals("tiff")) {
-                    embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/tiff");
-                    extension = "tif";
-                } else if (extension.equals("jpx")) {
-                    embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/jp2");
-                } else if (extension.equals("jb2")) {
-                    embeddedMetadata.set(
-                            Metadata.CONTENT_TYPE, "image/x-jbig2");
-                } else {
-                    //TODO: determine if we need to add more image types
-//                    throw new RuntimeException("EXTEN:" + extension);
-                }
-                Integer imageNumber = processedInlineImages.get(cosStream);
-                if (imageNumber == null) {
-                    imageNumber = inlineImageCounter++;
-                }
-                String fileName = "image" + imageNumber + "."+extension;
-                embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileName);
-
-                // Output the img tag
-                AttributesImpl attr = new AttributesImpl();
-                attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName);
-                attr.addAttribute("", "alt", "alt", "CDATA", fileName);
-                xhtml.startElement("img", attr);
-                xhtml.endElement("img");
-
-                //Do we only want to process unique COSObject ids?
-                //If so, have we already processed this one?
-                if (config.getExtractUniqueInlineImagesOnly() == true) {
-                    if (processedInlineImages.containsKey(cosStream)) {
-                        continue;
-                    }
-                    processedInlineImages.put(cosStream, imageNumber);
+            String fileName = "image" + imageNumber + "." + extension;
+            embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileName);
+
+            // Output the img tag
+            AttributesImpl attr = new AttributesImpl();
+            attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName);
+            attr.addAttribute("", "alt", "alt", "CDATA", fileName);
+            xhtml.startElement("img", attr);
+            xhtml.endElement("img");
+
+            //Do we only want to process unique COSObject ids?
+            //If so, have we already processed this one?
+            if (config.getExtractUniqueInlineImagesOnly() == true) {
+                if (processedInlineImages.containsKey(cosStream)) {
+                    return;
                 }
+                processedInlineImages.put(cosStream, imageNumber);
+            }
 
-                embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
-                        TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
+            embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                    TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
 
-                if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
-                    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+            if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
+                ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+                try {
+                    //extract the metadata contained outside of the image
+                    PDMetadataExtractor.extract(image.getMetadata(),
+                            embeddedMetadata, context);
                     try {
-                        //TODO: handle image.getMetadata()?
-                        try {
-                            writeToBuffer(image, extension, buffer);
-                        } catch (IOException e) {
-                            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
-                            continue;
-                        }
-                        try (InputStream embeddedIs = TikaInputStream.get(buffer.toByteArray())) {
-                            embeddedDocumentExtractor.parseEmbedded(
-                                    embeddedIs,
-                                    new EmbeddedContentHandler(xhtml),
-                                    embeddedMetadata, false);
-                        }
+                        writeToBuffer(image, extension, buffer);
                     } catch (IOException e) {
-                        handleCatchableIOE(e);
+                        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
+                        return;
+                    }
+                    try (InputStream embeddedIs = TikaInputStream.get(buffer.toByteArray())) {
+                        embeddedDocumentExtractor.parseEmbedded(
+                                embeddedIs,
+                                new EmbeddedContentHandler(xhtml),
+                                embeddedMetadata, false);
                     }
+                } catch (IOException e) {
+                    handleCatchableIOE(e);
                 }
             }
         }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 9840437..38c367d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -230,50 +230,38 @@ public class PDFParser extends AbstractParser implements Initializable {
             metadata.set(TikaCoreProperties.LANGUAGE, document.getDocumentCatalog().getLanguage());
         }
 
-        //now go for the XMP
-        Document dom = loadDOM(document.getDocumentCatalog().getMetadata(), metadata, context);
+        PDMetadataExtractor.extract(document.getDocumentCatalog().getMetadata(), metadata, context);
 
-        XMPMetadata xmp = null;
-        if (dom != null) {
-            xmp = new XMPMetadata(dom);
+        PDDocumentInformation info = document.getDocumentInformation();
+        metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
+        PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_TITLE, info.getTitle());
+        PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_CREATOR, info.getAuthor());
+        //if this wasn't already set by xmp, use doc info
+        if (metadata.get(TikaCoreProperties.CREATOR) == null) {
+            PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.CREATOR, info.getAuthor());
+        }
+        if (metadata.get(TikaCoreProperties.TITLE) == null) {
+            PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.TITLE, info.getTitle());
         }
-        XMPSchemaDublinCore dcSchema = null;
+        PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_CREATOR_TOOL, info.getCreator());
+        PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
+        PDMetadataExtractor.addMetadata(metadata, Office.KEYWORDS, info.getKeywords());
+        PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_KEY_WORDS, info.getKeywords());
+        PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_PRODUCER, info.getProducer());
 
-        if (xmp != null) {
-            try {
-                dcSchema = xmp.getDublinCoreSchema();
-            } catch (IOException e) {}
+        PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_SUBJECT, info.getSubject());
 
-            JempboxExtractor.extractXMPMM(xmp, metadata);
-        }
+        PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.SUBJECT, info.getKeywords());
+        PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.SUBJECT, info.getSubject());
+        PDMetadataExtractor.addMetadata(metadata, OfficeOpenXMLCore.SUBJECT, info.getSubject());
 
-        PDDocumentInformation info = document.getDocumentInformation();
-        metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
-        extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
-        addMetadata(metadata, PDF.DOC_INFO_TITLE, info.getTitle());
-        extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema);
-        addMetadata(metadata, PDF.DOC_INFO_CREATOR, info.getAuthor());
-        extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema);
-        addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
-        addMetadata(metadata, PDF.DOC_INFO_CREATOR_TOOL, info.getCreator());
-        addMetadata(metadata, Office.KEYWORDS, info.getKeywords());
-        addMetadata(metadata, PDF.DOC_INFO_KEY_WORDS, info.getKeywords());
-        addMetadata(metadata, PDF.DOC_INFO_PRODUCER, info.getProducer());
-        extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);
-
-        addMetadata(metadata, PDF.DOC_INFO_SUBJECT, info.getSubject());
-
-        addMetadata(metadata, TikaCoreProperties.SUBJECT, info.getKeywords());
-        addMetadata(metadata, TikaCoreProperties.SUBJECT, info.getSubject());
-        addMetadata(metadata, OfficeOpenXMLCore.SUBJECT, info.getSubject());
-
-        addMetadata(metadata, PDF.DOC_INFO_TRAPPED, info.getTrapped());
+        PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_TRAPPED, info.getTrapped());
         Calendar created = info.getCreationDate();
-        addMetadata(metadata, PDF.DOC_INFO_CREATED, created);
-        addMetadata(metadata, TikaCoreProperties.CREATED, created);
+        PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_CREATED, created);
+        PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.CREATED, created);
         Calendar modified = info.getModificationDate();
-        addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
-        addMetadata(metadata, PDF.DOC_INFO_MODIFICATION_DATE, modified);
+        PDMetadataExtractor.addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
+        PDMetadataExtractor.addMetadata(metadata, PDF.DOC_INFO_MODIFICATION_DATE, modified);
 
         // All remaining metadata is custom
         // Copy this over as-is
@@ -282,8 +270,8 @@ public class PDFParser extends AbstractParser implements Initializable {
         for (COSName key : info.getCOSObject().keySet()) {
             String name = key.getName();
             if (!handledMetadata.contains(name)) {
-                addMetadata(metadata, name, info.getCOSObject().getDictionaryObject(key));
-                addMetadata(metadata, PDF.PDF_DOC_INFO_CUSTOM_PREFIX + name,
+                PDMetadataExtractor.addMetadata(metadata, name, info.getCOSObject().getDictionaryObject(key));
+                PDMetadataExtractor.addMetadata(metadata, PDF.PDF_DOC_INFO_CUSTOM_PREFIX + name,
                         info.getCOSObject().getDictionaryObject(key));
             }
         }
@@ -298,27 +286,7 @@ public class PDFParser extends AbstractParser implements Initializable {
                 MEDIA_TYPE.toString() + "; version=" +
                         Float.toString(document.getDocument().getVersion()));
 
-        try {
-            if (xmp != null) {
-                xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
-                XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class);
-                if (pdfaxmp != null) {
-                    if (pdfaxmp.getPart() != null) {
-                        metadata.set(PDF.PDFAID_PART, Integer.toString(pdfaxmp.getPart()));
-                    }
-                    if (pdfaxmp.getConformance() != null) {
-                        metadata.set(PDF.PDFAID_CONFORMANCE, pdfaxmp.getConformance());
-                        String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
-                        metadata.set(PDF.PDFA_VERSION, version);
-                        metadata.add(TikaCoreProperties.FORMAT.getName(),
-                                MEDIA_TYPE.toString() + "; version=\"" + version + "\"");
-                    }
-                }
-                // TODO WARN if this XMP version is inconsistent with document header version?          
-            }
-        } catch (IOException e) {
-            metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e);
-        }
+
         //TODO: Let's try to move this into PDFBox.
         //Attempt to determine Adobe extension level, if present:
         COSDictionary root = document.getDocumentCatalog().getCOSObject();
@@ -346,169 +314,6 @@ public class PDFParser extends AbstractParser implements Initializable {
         }
     }
 
-    /**
-     * Try to extract all multilingual items from the XMPSchema
-     * <p/>
-     * This relies on the property having a valid xmp getName()
-     * <p/>
-     * For now, this only extracts the first language if the property does not allow multiple values (see TIKA-1295)
-     *
-     * @param metadata
-     * @param property
-     * @param pdfBoxBaseline
-     * @param schema
-     */
-    private void extractMultilingualItems(Metadata metadata, Property property,
-                                          String pdfBoxBaseline, XMPSchema schema) {
-        //if schema is null, just go with pdfBoxBaseline
-        if (schema == null) {
-            if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
-                addMetadata(metadata, property, pdfBoxBaseline);
-            }
-            return;
-        }
-
-        for (String lang : schema.getLanguagePropertyLanguages(property.getName())) {
-            String value = schema.getLanguageProperty(property.getName(), lang);
-
-            if (value != null && value.length() > 0) {
-                //if you're going to add it below in the baseline addition, don't add it now
-                if (pdfBoxBaseline != null && value.equals(pdfBoxBaseline)) {
-                    continue;
-                }
-                addMetadata(metadata, property, value);
-                if (!property.isMultiValuePermitted()) {
-                    return;
-                }
-            }
-        }
-
-        if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
-            //if we've already added something above and multivalue is not permitted
-            //return.
-            if (!property.isMultiValuePermitted()) {
-                if (metadata.get(property) != null) {
-                    return;
-                }
-            }
-            addMetadata(metadata, property, pdfBoxBaseline);
-        }
-    }
-
-
-    /**
-     * This tries to read a list from a particular property in
-     * XMPSchemaDublinCore.
-     * If it can't find the information, it falls back to the
-     * pdfboxBaseline.  The pdfboxBaseline should be the value
-     * that pdfbox returns from its PDDocumentInformation object
-     * (e.g. getAuthor()) This method is designed include the pdfboxBaseline,
-     * and it should not duplicate the pdfboxBaseline.
-     * <p/>
-     * Until PDFBOX-1803/TIKA-1233 are fixed, do not call this
-     * on dates!
-     * <p/>
-     * This relies on the property having a DublinCore compliant getName()
-     *
-     * @param property
-     * @param pdfBoxBaseline
-     * @param dc
-     * @param metadata
-     */
-    private void extractDublinCoreListItems(Metadata metadata, Property property,
-                                            String pdfBoxBaseline, XMPSchemaDublinCore dc) {
-        //if no dc, add baseline and return
-        if (dc == null) {
-            if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
-                addMetadata(metadata, property, pdfBoxBaseline);
-            }
-            return;
-        }
-        List<String> items = getXMPBagOrSeqList(dc, property.getName());
-        if (items == null) {
-            if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
-                addMetadata(metadata, property, pdfBoxBaseline);
-            }
-            return;
-        }
-        for (String item : items) {
-            if (pdfBoxBaseline != null && !item.equals(pdfBoxBaseline)) {
-                addMetadata(metadata, property, item);
-            }
-        }
-        //finally, add the baseline
-        if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
-            addMetadata(metadata, property, pdfBoxBaseline);
-        }
-    }
-
-    /**
-     * As of this writing, XMPSchema can contain bags or sequence lists
-     * for some attributes...despite standards documentation.
-     * JempBox expects one or the other for specific attributes.
-     * Until more flexibility is added to JempBox, Tika will have to handle both.
-     *
-     * @param schema
-     * @param name
-     * @return list of values or null
-     */
-    private List<String> getXMPBagOrSeqList(XMPSchema schema, String name) {
-        List<String> ret = schema.getBagList(name);
-        if (ret == null) {
-            ret = schema.getSequenceList(name);
-        }
-        return ret;
-    }
-
-    private void addMetadata(Metadata metadata, Property property, String value) {
-        if (value != null) {
-            String decoded = decode(value);
-            if (property.isMultiValuePermitted() || metadata.get(property) == null) {
-                metadata.add(property, decoded);
-            }
-            //silently skip adding property that already exists if multiple values are not permitted
-        }
-    }
-
-    private void addMetadata(Metadata metadata, String name, String value) {
-        if (value != null) {
-            metadata.add(name, decode(value));
-        }
-    }
-
-    private String decode(String value) {
-        if (PDFEncodedStringDecoder.shouldDecode(value)) {
-            PDFEncodedStringDecoder d = new PDFEncodedStringDecoder();
-            return d.decode(value);
-        }
-        return value;
-    }
-
-
-    private void addMetadata(Metadata metadata, Property property, Calendar value) {
-        if (value != null) {
-            metadata.set(property, value);
-        }
-    }
-
-    /**
-     * Used when processing custom metadata entries, as PDFBox won't do
-     * the conversion for us in the way it does for the standard ones
-     */
-    private void addMetadata(Metadata metadata, String name, COSBase value) {
-        if (value instanceof COSArray) {
-            for (Object v : ((COSArray) value).toList()) {
-                addMetadata(metadata, name, ((COSBase) v));
-            }
-        } else if (value instanceof COSString) {
-            addMetadata(metadata, name, ((COSString) value).getString());
-        }
-        // Avoid calling COSDictionary#toString, since it can lead to infinite
-        // recursion. See TIKA-1038 and PDFBOX-1835.
-        else if (value != null && !(value instanceof COSDictionary)) {
-            addMetadata(metadata, name, value.toString());
-        }
-    }
 
 
     private boolean shouldHandleXFAOnly(PDDocument pdDocument, PDFParserConfig config) {
@@ -725,28 +530,6 @@ public class PDFParser extends AbstractParser implements Initializable {
     public void setInitializableProblemHandler(InitializableProblemHandler initializableProblemHandler) {
         this.initializableProblemHandler = initializableProblemHandler;
     }
-    //can return null!
-    private Document loadDOM(PDMetadata pdMetadata, Metadata metadata, ParseContext context) {
-        if (pdMetadata == null) {
-            return null;
-        }
-        InputStream is = null;
-        try {
-            try {
-                is = pdMetadata.exportXMPMetadata();
-            } catch (IOException e) {
-                EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
-                return null;
-            }
-            return XMLReaderUtils.buildDOM(is, context);
-        } catch (IOException|SAXException|TikaException e) {
-            EmbeddedDocumentUtil.recordException(e, metadata);
-        } finally {
-            IOUtils.closeQuietly(is);
-        }
-        return null;
-
-    }
 
     /**
      * This is a no-op.  There is no need to initialize multiple fields.
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
new file mode 100644
index 0000000..984a38b
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
@@ -0,0 +1,270 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Calendar;
+import java.util.List;
+import java.util.Locale;
+
+import org.apache.jempbox.xmp.XMPMetadata;
+import org.apache.jempbox.xmp.XMPSchema;
+import org.apache.jempbox.xmp.XMPSchemaDublinCore;
+import org.apache.jempbox.xmp.pdfa.XMPSchemaPDFAId;
+import org.apache.pdfbox.cos.COSArray;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.cos.COSString;
+import org.apache.pdfbox.pdmodel.common.PDMetadata;
+import org.apache.poi.util.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PDF;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.image.xmp.JempboxExtractor;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.w3c.dom.Document;
+import org.xml.sax.SAXException;
+
+class PDMetadataExtractor {
+
+    private static final MediaType MEDIA_TYPE = MediaType.application("pdf");
+
+
+    /**
+     * Reads the XMP packet exported by a PDF metadata stream and copies selected
+     * values into <code>metadata</code>: the fields handled by
+     * {@link JempboxExtractor#extractXMPMM}, the Dublin Core description,
+     * contributor, creator and title, and the PDF/A identification schema
+     * (part, conformance, and the version/format strings derived from them).
+     * <p/>
+     * I/O problems while reading the XMP are recorded in <code>metadata</code>
+     * instead of being thrown.
+     *
+     * @param pdMetadata PDF metadata stream; if null, nothing is extracted
+     * @param metadata   metadata object to populate
+     * @param context    parse context handed to the XML parser
+     */
+    static void extract(PDMetadata pdMetadata, Metadata metadata, ParseContext context) {
+        if (pdMetadata == null) {
+            return;
+        }
+        //now go for the XMP
+        Document dom = loadDOM(pdMetadata, metadata, context);
+        if (dom == null) {
+            //nothing could be loaded, so there is nothing to extract
+            return;
+        }
+        XMPMetadata xmp = new XMPMetadata(dom);
+
+        XMPSchemaDublinCore dcSchema = null;
+        try {
+            dcSchema = xmp.getDublinCoreSchema();
+        } catch (IOException e) {
+            //best effort: carry on without Dublin Core, but leave a trace
+            //in the metadata rather than dropping the exception
+            EmbeddedDocumentUtil.recordException(e, metadata);
+        }
+        JempboxExtractor.extractXMPMM(xmp, metadata);
+
+        extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);
+        extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, dcSchema);
+        extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, dcSchema);
+        extractMultilingualItems(metadata, TikaCoreProperties.TITLE, null, dcSchema);
+
+        try {
+            xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
+            XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class);
+            if (pdfaxmp != null) {
+                //read each value once; the getters are otherwise called repeatedly
+                Integer part = pdfaxmp.getPart();
+                String conformance = pdfaxmp.getConformance();
+                if (part != null) {
+                    metadata.set(PDF.PDFAID_PART, Integer.toString(part));
+                }
+                if (conformance != null) {
+                    metadata.set(PDF.PDFAID_CONFORMANCE, conformance);
+                }
+                //the version string needs both pieces; without the part it
+                //would read "A-null..." so it is skipped in that case
+                if (part != null && conformance != null) {
+                    String version = "A-" + part + conformance.toLowerCase(Locale.ROOT);
+                    metadata.set(PDF.PDFA_VERSION, version);
+                    metadata.add(TikaCoreProperties.FORMAT.getName(),
+                            MEDIA_TYPE.toString() + "; version=\"" + version + "\"");
+                }
+            }
+            // TODO WARN if this XMP version is inconsistent with document header version?
+        } catch (IOException e) {
+            metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e);
+        }
+    }
+
+    /**
+     * Looks up a list-valued attribute that may be stored either as a bag or
+     * as a sequence.
+     * <p/>
+     * As of this writing, XMPSchema can contain bags or sequence lists
+     * for some attributes...despite standards documentation, while JempBox
+     * expects one or the other for specific attributes. Until more flexibility
+     * is added to JempBox, the bag form is tried first and the sequence form
+     * is used as the fallback.
+     *
+     * @param schema schema to read from; must not be null
+     * @param name name of the attribute to look up
+     * @return list of values or null
+     */
+    static List<String> getXMPBagOrSeqList(XMPSchema schema, String name) {
+        List<String> bagValues = schema.getBagList(name);
+        return (bagValues != null) ? bagValues : schema.getSequenceList(name);
+    }
+
+    /**
+     * Try to extract all multilingual items from the XMPSchema
+     * <p/>
+     * This relies on the property having a valid xmp getName()
+     * <p/>
+     * For now, this only extracts the first language if the property does not allow multiple values (see TIKA-1295)
+     *
+     * @param metadata
+     * @param property
+     * @param pdfBoxBaseline
+     * @param schema
+     */
+    private static void extractMultilingualItems(Metadata metadata, Property property,
+                                          String pdfBoxBaseline, XMPSchema schema) {
+        //if schema is null, just go with pdfBoxBaseline
+        if (schema == null) {
+            if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
+                addMetadata(metadata, property, pdfBoxBaseline);
+            }
+            return;
+        }
+
+        for (String lang : schema.getLanguagePropertyLanguages(property.getName())) {
+            String value = schema.getLanguageProperty(property.getName(), lang);
+
+            if (value != null && value.length() > 0) {
+                //if you're going to add it below in the baseline addition, don't add it now
+                if (pdfBoxBaseline != null && value.equals(pdfBoxBaseline)) {
+                    continue;
+                }
+                addMetadata(metadata, property, value);
+                if (!property.isMultiValuePermitted()) {
+                    return;
+                }
+            }
+        }
+
+        if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
+            //if we've already added something above and multivalue is not permitted
+            //return.
+            if (!property.isMultiValuePermitted()) {
+                if (metadata.get(property) != null) {
+                    return;
+                }
+            }
+            addMetadata(metadata, property, pdfBoxBaseline);
+        }
+    }
+
+
+    /**
+     * Reads a list-valued property from XMPSchemaDublinCore and hands each
+     * item to {@link #addMetadata(Metadata, Property, String)}. Nothing is
+     * added when there is no schema or the schema has no list for the property.
+     * <p/>
+     * Until PDFBOX-1803/TIKA-1233 are fixed, do not call this
+     * on dates!
+     * <p/>
+     * This relies on the property having a DublinCore compliant getName()
+     *
+     * @param metadata metadata to add the items to
+     * @param property property to read from the schema and to write to
+     * @param dc Dublin Core schema; may be null
+     */
+    private static void extractDublinCoreListItems(Metadata metadata, Property property, XMPSchemaDublinCore dc) {
+        List<String> values = (dc == null) ? null : getXMPBagOrSeqList(dc, property.getName());
+        if (values != null) {
+            for (String value : values) {
+                addMetadata(metadata, property, value);
+            }
+        }
+    }
+
+
+    /**
+     * Adds the decoded value to the property. A null value is ignored, and so is
+     * any value for a property that does not permit multiple values and already
+     * has one.
+     *
+     * @param metadata metadata to add to
+     * @param property property to add the value under
+     * @param value raw value; run through {@link #decode(String)} before it is added
+     */
+    static void addMetadata(Metadata metadata, Property property, String value) {
+        if (value == null) {
+            return;
+        }
+        String decoded = decode(value);
+        if (!property.isMultiValuePermitted() && metadata.get(property) != null) {
+            //silently skip adding property that already exists if multiple values are not permitted
+            return;
+        }
+        metadata.add(property, decoded);
+    }
+
+    /**
+     * Adds the decoded value under the given name; a null value is ignored.
+     *
+     * @param metadata metadata to add to
+     * @param name metadata key
+     * @param value raw value; run through {@link #decode(String)} before it is added
+     */
+    static void addMetadata(Metadata metadata, String name, String value) {
+        if (value == null) {
+            return;
+        }
+        metadata.add(name, decode(value));
+    }
+
+    /**
+     * Passes the value through a PDFEncodedStringDecoder when
+     * PDFEncodedStringDecoder.shouldDecode says it needs it.
+     *
+     * @param value string to inspect
+     * @return the decoded string, or <code>value</code> itself if no decoding was needed
+     */
+    static String decode(String value) {
+        return PDFEncodedStringDecoder.shouldDecode(value)
+                ? new PDFEncodedStringDecoder().decode(value)
+                : value;
+    }
+
+    //can return null!
+    private static Document loadDOM(PDMetadata pdMetadata, Metadata metadata, ParseContext context) {
+        if (pdMetadata == null) {
+            return null;
+        }
+        InputStream is = null;
+        try {
+            try {
+                is = pdMetadata.exportXMPMetadata();
+            } catch (IOException e) {
+                EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
+                return null;
+            }
+            return XMLReaderUtils.buildDOM(is, context);
+        } catch (IOException| SAXException | TikaException e) {
+            EmbeddedDocumentUtil.recordException(e, metadata);
+        } finally {
+            IOUtils.closeQuietly(is);
+        }
+        return null;
+
+    }
+
+    /**
+     * Sets the property to the given date; a null value is ignored.
+     *
+     * @param metadata metadata to update
+     * @param property property to set
+     * @param value date to store, or null to do nothing
+     */
+    static void addMetadata(Metadata metadata, Property property, Calendar value) {
+        if (value == null) {
+            return;
+        }
+        metadata.set(property, value);
+    }
+
+    /**
+     * Used when processing custom metadata entries, as PDFBox won't do
+     * the conversion for us in the way it does for the standard ones
+     * <p/>
+     * Arrays are unpacked element by element (recursively), strings are added
+     * via getString(), dictionaries and nulls are skipped, and anything else
+     * is added via toString().
+     *
+     * @param metadata metadata to add to
+     * @param name metadata key
+     * @param value COS object to convert; may be null
+     */
+    static void addMetadata(Metadata metadata, String name, COSBase value) {
+        // Avoid calling COSDictionary#toString, since it can lead to infinite
+        // recursion. See TIKA-1038 and PDFBOX-1835.
+        if (value == null || value instanceof COSDictionary) {
+            return;
+        }
+        if (value instanceof COSArray) {
+            for (Object element : ((COSArray) value).toList()) {
+                addMetadata(metadata, name, (COSBase) element);
+            }
+            return;
+        }
+        String text = (value instanceof COSString)
+                ? ((COSString) value).getString()
+                : value.toString();
+        addMetadata(metadata, name, text);
+    }
+}