You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/03/02 23:09:22 UTC
[tika] 01/03: TIKA-3058 process page-level xmp if it exists
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit d645ae723b4d3df5a3bc559f420e58a8f3be8c49
Author: tallison <ta...@apache.org>
AuthorDate: Mon Mar 2 13:42:07 2020 -0500
TIKA-3058 process page-level xmp if it exists
---
.../main/java/org/apache/tika/metadata/PDF.java | 6 +++
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 52 ++++++++++++++++------
2 files changed, 44 insertions(+), 14 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index 0220948..608d5df 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -89,6 +89,12 @@ public interface PDF {
Property HAS_XMP = Property.internalBoolean(PDF_PREFIX+"hasXMP");
/**
+ * If xmp is extracted by, e.g. the XMLProfiler, where did it come from?
+ * The document catalog or a specific page...or?
+ */
+ Property XMP_LOCATION = Property.internalText(PDF_PREFIX+"xmpLocation");
+
+ /**
* Has > 0 AcroForm fields
*/
Property HAS_ACROFORM_FIELDS = Property.internalBoolean(PDF_PREFIX+"hasAcroFormFields");
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 2e58123..c2eb77f 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -100,6 +100,7 @@ import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.ocr.TesseractOCRParser;
+import org.apache.tika.parser.sas.SAS7BDATParser;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
@@ -146,6 +147,9 @@ class AbstractPDF2XHTML extends PDFTextStripper {
private static final MediaType XFA_MEDIA_TYPE = MediaType.application("vnd.adobe.xdp+xml");
private static final MediaType XMP_MEDIA_TYPE = MediaType.application("rdf+xml");
+ public static final String XMP_DOCUMENT_CATALOG_LOCATION = "documentCatalog";
+ public static final String XMP_PAGE_LOCATION_PREFIX = "page ";
+
/**
* Format used for signature dates
* TODO Make this thread-safe
@@ -202,25 +206,26 @@ class AbstractPDF2XHTML extends PDFTextStripper {
supportedTypes = embeddedParser.getSupportedTypes(context);
}
- if (pdfDocument.getDocumentCatalog().getMetadata() != null) {
- Metadata xmpMetadata = new Metadata();
- xmpMetadata.set(Metadata.CONTENT_TYPE, XMP_MEDIA_TYPE.toString());
- xmpMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.METADATA.toString());
- if (embeddedDocumentExtractor.shouldParseEmbedded(xmpMetadata) &&
- supportedTypes.contains(XMP_MEDIA_TYPE)) {
- InputStream is = null;
- try {
- is = pdfDocument.getDocumentCatalog().getMetadata().exportXMPMetadata();
+ if (supportedTypes.contains(XMP_MEDIA_TYPE)) {
+ //try the main metadata
+ if (pdfDocument.getDocumentCatalog().getMetadata() != null) {
+ try (InputStream is = pdfDocument.getDocumentCatalog().getMetadata().exportXMPMetadata()) {
+ extractXMPAsEmbeddedFile(is, XMP_DOCUMENT_CATALOG_LOCATION);
} catch (IOException e) {
EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
}
- if (is != null) {
- try {
- parseMetadata(is, xmpMetadata);
- } finally {
- org.apache.tika.io.IOUtils.closeQuietly(is);
+ }
+ //now iterate through the pages
+ int pageNumber = 1;
+ for (PDPage page : pdfDocument.getPages()) {
+ if (page.getMetadata() != null) {
+ try (InputStream is = page.getMetadata().exportXMPMetadata()) {
+ extractXMPAsEmbeddedFile(is, XMP_PAGE_LOCATION_PREFIX+pageNumber);
+ } catch (IOException e) {
+ EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
}
}
+ pageNumber++;
}
}
@@ -248,6 +253,24 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
}
+ private void extractXMPAsEmbeddedFile(InputStream is, String location) throws IOException, SAXException {
+ if (is == null) {
+ return;
+ }
+ Metadata xmpMetadata = new Metadata();
+ xmpMetadata.set(Metadata.CONTENT_TYPE, XMP_MEDIA_TYPE.toString());
+ xmpMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.METADATA.toString());
+ xmpMetadata.set(PDF.XMP_LOCATION, location);
+ if (embeddedDocumentExtractor.shouldParseEmbedded(xmpMetadata)) {
+ try {
+ parseMetadata(is, xmpMetadata);
+ } finally {
+ org.apache.tika.io.IOUtils.closeQuietly(is);
+ }
+ }
+
+ }
+
private void parseMetadata(InputStream stream, Metadata embeddedMetadata) throws IOException, SAXException {
try {
embeddedDocumentExtractor.parseEmbedded(
@@ -441,6 +464,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
metadata.add(PDF.UNMAPPED_UNICODE_CHARS_PER_PAGE,
unmappedUnicodeCharsPerPage);
+
try {
for (PDAnnotation annotation : page.getAnnotations()) {