You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2020/09/20 23:03:56 UTC
[tika] branch branch_1x updated: TIKA-3188: Add Adobe Indesign IDML
Parser
This is an automated email from the ASF dual-hosted git repository.
dmeikle pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new 56ad418 TIKA-3188: Add Adobe Indesign IDML Parser
56ad418 is described below
commit 56ad41892036dbd75e5fe8ebb34100c8aafde757
Author: David Meikle <dm...@apache.org>
AuthorDate: Mon Sep 21 00:02:13 2020 +0100
TIKA-3188: Add Adobe Indesign IDML Parser
---
.../org/apache/tika/mime/tika-mimetypes.xml | 8 +-
tika-parsers/pom.xml | 6 +
.../indesign/ContentAndMetadataExtractor.java | 108 ++++++++++++
.../apache/tika/parser/indesign/IDMLParser.java | 187 +++++++++++++++++++++
.../parser/indesign/xmp/XMPMetadataExtractor.java | 162 ++++++++++++++++++
.../org/apache/tika/parser/pkg/PackageParser.java | 1 +
.../services/org.apache.tika.parser.Parser | 1 +
.../tika/parser/indesign/IDMLParserTest.java | 62 +++++++
.../resources/test-documents/testIndesign.idml | Bin 0 -> 30574 bytes
9 files changed, 534 insertions(+), 1 deletion(-)
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index a92ab22..2acb8ea 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -60,7 +60,6 @@
<sub-class-of type="application/xml"/>
</mime-type>
-
<mime-type type="application/atom+xml">
<root-XML localName="feed" namespaceURI="http://purl.org/atom/ns#"/>
<root-XML localName="feed" namespaceURI="http://www.w3.org/2005/Atom"/>
@@ -280,6 +279,13 @@
</mime-type>
<mime-type type="application/im-iscomposing+xml"/>
+
+ <mime-type type="application/vnd.adobe.indesign-idml-package">
+ <sub-class-of type="application/zip"/>
+ <_comment>IDML</_comment>
+ <glob pattern="*.idml"/>
+ </mime-type>
+
<mime-type type="application/index"/>
<mime-type type="application/index.cmd"/>
<mime-type type="application/index.obj"/>
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index 09b97ce..15a80db 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -49,6 +49,7 @@
<vorbis.version>0.8</vorbis.version>
<pdfbox.version>2.0.21</pdfbox.version>
<jempbox.version>1.8.16</jempbox.version>
+ <xmpbox.version>2.0.21</xmpbox.version>
<netcdf-java.version>4.5.5</netcdf-java.version>
<sis.version>1.0</sis.version>
<parso.version>2.0.12</parso.version>
@@ -213,6 +214,11 @@
<artifactId>jempbox</artifactId>
<version>${jempbox.version}</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.pdfbox</groupId>
+ <artifactId>xmpbox</artifactId>
+ <version>${xmpbox.version}</version>
+ </dependency>
<!-- TIKA-370: PDFBox declares the Bouncy Castle dependencies
as optional, but we prefer to have them always to avoid
problems with encrypted PDFs. -->
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/indesign/ContentAndMetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/indesign/ContentAndMetadataExtractor.java
new file mode 100644
index 0000000..d6947b2
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/indesign/ContentAndMetadataExtractor.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.indesign;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Extractor for InDesign Content and Metadata.
+ */
+class ContentAndMetadataExtractor {
+
+    private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
+
+    /**
+     * Extract the InDesign Story Content and emit to the <code>XHTMLContentHandler</code>.
+     *
+     * @param stream the document stream (input); it is shielded and not closed here
+     * @param handler handler for the XHTML SAX events (output)
+     * @param metadata document metadata (input and output)
+     * @param context parse context
+     * @throws IOException if the document stream could not be read
+     * @throws SAXException if the SAX events could not be processed
+     * @throws TikaException if the document could not be parsed
+     */
+    static void extract(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // Parse the content using the inner content handler; the close shield keeps
+        // the XML parser from closing a stream that still belongs to the caller.
+        XMLReaderUtils.parseSAX(
+                new CloseShieldInputStream(stream), new ContentAndMetadataHandler(handler, metadata), context);
+    }
+
+    /**
+     * Content handler for InDesign Content and Metadata.
+     */
+    static class ContentAndMetadataHandler extends DefaultHandler {
+
+        private final ContentHandler handler;
+        private final Metadata metadata;
+        private boolean inContent = false;
+
+        ContentAndMetadataHandler(ContentHandler handler, Metadata metadata) {
+            this.handler = handler;
+            this.metadata = metadata;
+        }
+
+        @Override
+        public void startElement(String uri, String localName, String qName, Attributes attributes)
+                throws SAXException {
+
+            // Get Spread Metadata; a spread without a PageCount attribute contributes nothing.
+            if ("Spread".equals(localName) || "MasterSpread".equals(localName)) {
+                String pageCount = attributes.getValue("PageCount");
+                if (pageCount != null) {
+                    metadata.add("PageCount", pageCount);
+                }
+            }
+
+            // Trigger processing of content from Spread or Stories
+            if ("Content".equals(localName)) {
+                inContent = true;
+                handler.startElement(XHTMLContentHandler.XHTML, "p", "p", EMPTY_ATTRIBUTES);
+            }
+        }
+
+        @Override
+        public void characters(char[] ch, int start, int length) throws SAXException {
+            if (inContent) {
+                handler.characters(ch, start, length);
+            }
+        }
+
+        @Override
+        public void endElement(String uri, String localName, String qName) throws SAXException {
+            if ("Content".equals(localName)) {
+                inContent = false;
+                handler.endElement(XHTMLContentHandler.XHTML, "p", "p");
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/indesign/IDMLParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/indesign/IDMLParser.java
new file mode 100644
index 0000000..6832594
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/indesign/IDMLParser.java
@@ -0,0 +1,234 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.indesign;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.indesign.xmp.XMPMetadataExtractor;
+import org.apache.tika.sax.EndDocumentShieldingContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
+import java.util.zip.ZipInputStream;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Adobe InDesign IDML Parser.
+ *
+ * <p>Page counts are accumulated per {@code parse} call rather than in instance fields, so
+ * a reused parser instance never carries counts over from a previously parsed document.
+ */
+public class IDMLParser extends AbstractParser {
+
+    /**
+     * IDML MimeType
+     */
+    private static final MediaType IDML_CONTENT_TYPE
+            = MediaType.application("vnd.adobe.indesign-idml-package");
+
+    /**
+     * Supported types set.
+     */
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(IDML_CONTENT_TYPE);
+
+    /**
+     * Metadata file name.
+     */
+    private static final String META_NAME = "META-INF/metadata.xml";
+
+    /**
+     * Page counters accumulated while parsing a single document.
+     */
+    private static final class PageCounts {
+        private int spread;
+        private int masterSpread;
+    }
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Extracts text and metadata from an IDML package.
+     *
+     * @param stream the IDML (zip) document stream
+     * @param baseHandler handler receiving the XHTML SAX events
+     * @param metadata document metadata, updated with XMP values and page counts
+     * @param context parse context
+     * @throws IOException if the package could not be read
+     * @throws SAXException if the SAX events could not be processed
+     * @throws TikaException if the document could not be parsed
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler baseHandler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        ZipFile zipFile = null;
+        ZipInputStream zipStream = null;
+        if (stream instanceof TikaInputStream) {
+            TikaInputStream tis = (TikaInputStream) stream;
+            Object container = tis.getOpenContainer();
+            if (container instanceof ZipFile) {
+                zipFile = (ZipFile) container;
+            } else if (tis.hasFile()) {
+                zipFile = new ZipFile(tis.getFile());
+            } else {
+                zipStream = new ZipInputStream(stream);
+            }
+        } else {
+            zipStream = new ZipInputStream(stream);
+        }
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);
+        xhtml.startDocument();
+        EndDocumentShieldingContentHandler handler = new EndDocumentShieldingContentHandler(xhtml);
+
+        // Fresh counters for every document; see the class comment.
+        PageCounts counts = new PageCounts();
+
+        if (zipFile != null) {
+            try {
+                handleZipFile(zipFile, metadata, context, handler, counts);
+            } finally {
+                zipFile.close();
+            }
+        } else {
+            try {
+                handleZipStream(zipStream, metadata, context, handler, counts);
+            } finally {
+                zipStream.close();
+            }
+        }
+
+        metadata.set("SpreadPageCount", Integer.toString(counts.spread));
+        metadata.set("MasterSpreadPageCount", Integer.toString(counts.masterSpread));
+        metadata.set("TotalPageCount", Integer.toString(counts.spread + counts.masterSpread));
+
+        xhtml.endDocument();
+
+        if (handler.getEndDocumentWasCalled()) {
+            handler.reallyEndDocument();
+        }
+    }
+
+    /**
+     * Handles every entry of a package that can only be read sequentially.
+     */
+    private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context,
+                                 ContentHandler handler, PageCounts counts)
+            throws IOException, TikaException, SAXException {
+        ZipEntry entry = zipStream.getNextEntry();
+        if (entry == null) {
+            throw new IOException("No entries found in ZipInputStream");
+        }
+        do {
+            handleZipEntry(entry, zipStream, metadata, context, handler, counts);
+            entry = zipStream.getNextEntry();
+        } while (entry != null);
+    }
+
+    /**
+     * Handles every entry of a randomly accessible package, metadata entry first.
+     */
+    private void handleZipFile(ZipFile zipFile, Metadata metadata, ParseContext context,
+                               ContentHandler handler, PageCounts counts)
+            throws IOException, TikaException, SAXException {
+
+        ZipEntry entry = zipFile.getEntry(META_NAME);
+        if (entry != null) {
+            // Close each entry stream once handled instead of leaking it.
+            try (InputStream entryStream = zipFile.getInputStream(entry)) {
+                handleZipEntry(entry, entryStream, metadata, context, handler, counts);
+            }
+        }
+
+        Enumeration<? extends ZipEntry> entries = zipFile.entries();
+        while (entries.hasMoreElements()) {
+            entry = entries.nextElement();
+            if (!META_NAME.equals(entry.getName())) {
+                try (InputStream entryStream = zipFile.getInputStream(entry)) {
+                    handleZipEntry(entry, entryStream, metadata, context, handler, counts);
+                }
+            }
+        }
+    }
+
+    /**
+     * Dispatches a single package entry according to its name.
+     */
+    private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
+                                ParseContext context, ContentHandler handler, PageCounts counts)
+            throws IOException, SAXException, TikaException {
+
+        // Directory entries carry no XML; handing their empty stream to the SAX parser would fail.
+        if (entry == null || entry.isDirectory()) {
+            return;
+        }
+
+        String name = entry.getName();
+        if (name.equals("mimetype")) {
+            String type = IOUtils.toString(zip, UTF_8);
+            metadata.set(Metadata.CONTENT_TYPE, type);
+        } else if (name.equals(META_NAME)) {
+            XMPMetadataExtractor.parse(zip, metadata);
+        } else if (name.contains("MasterSpreads")) {
+            counts.masterSpread += extractPageCount(zip, handler, context);
+        } else if (name.contains("Spreads/Spread")) {
+            counts.spread += extractPageCount(zip, handler, context);
+        } else if (name.contains("Stories")) {
+            ContentAndMetadataExtractor.extract(zip, handler, new Metadata(), context);
+        }
+    }
+
+    /**
+     * Emits the content of a spread entry and returns the page count it declares.
+     *
+     * @return the declared page count, or 0 when it is missing or not a valid integer
+     */
+    private static int extractPageCount(InputStream zip, ContentHandler handler, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        Metadata embeddedMeta = new Metadata();
+        ContentAndMetadataExtractor.extract(zip, handler, embeddedMeta, context);
+        String declared = embeddedMeta.get("PageCount");
+        if (declared == null) {
+            return 0;
+        }
+        try {
+            return Integer.parseInt(declared.trim());
+        } catch (NumberFormatException ignored) {
+            // A malformed count must not abort extraction of the whole document.
+            return 0;
+        }
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/indesign/xmp/XMPMetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/indesign/xmp/XMPMetadataExtractor.java
new file mode 100644
index 0000000..1cb11ee
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/indesign/xmp/XMPMetadataExtractor.java
@@ -0,0 +1,197 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.indesign.xmp;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.CloseShieldInputStream;
+import org.apache.tika.metadata.DublinCore;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.XMP;
+import org.apache.xmpbox.XMPMetadata;
+import org.apache.xmpbox.schema.DublinCoreSchema;
+import org.apache.xmpbox.schema.XMPBasicSchema;
+import org.apache.xmpbox.xml.DomXmpParser;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Calendar;
+import java.util.Date;
+import java.util.List;
+
+/**
+ * XMP Metadata Extractor based on Apache XmpBox.
+ */
+public class XMPMetadataExtractor {
+
+    /**
+     * Parse the XMP Packets.
+     *
+     * <p>Extraction is best effort: a packet that cannot be parsed is ignored.
+     *
+     * @param stream the stream to parse; it is shielded and not closed here.
+     * @param metadata the metadata collection to update
+     * @throws IOException on any IO error.
+     * @throws TikaException on any Tika error.
+     */
+    public static void parse(InputStream stream, Metadata metadata) throws IOException, TikaException {
+        XMPMetadata xmp;
+        try {
+            DomXmpParser xmpParser = new DomXmpParser();
+            xmpParser.setStrictParsing(false);
+            xmp = xmpParser.parse(new CloseShieldInputStream(stream));
+        } catch (Throwable ex) {
+            // Deliberately swallowed: an unparseable XMP packet simply yields no metadata.
+            return;
+        }
+        extractDublinCoreSchema(xmp, metadata);
+        extractXMPBasicSchema(xmp, metadata);
+    }
+
+    /**
+     * Extracts Dublin Core.
+     *
+     * Exceptions raised while looking up the schema are silently swallowed.
+     *
+     * @param xmp the XMP Metadata object.
+     * @param metadata the metadata map
+     */
+    public static void extractDublinCoreSchema(XMPMetadata xmp, Metadata metadata) throws IOException {
+        if (xmp == null) {
+            return;
+        }
+        DublinCoreSchema schemaDublinCore;
+        try {
+            schemaDublinCore = xmp.getDublinCoreSchema();
+        } catch (Throwable e) {
+            // Swallow
+            return;
+        }
+        if (schemaDublinCore != null) {
+            addMetadata(metadata, DublinCore.TITLE, schemaDublinCore.getTitle());
+            addMetadata(metadata, DublinCore.FORMAT, schemaDublinCore.getFormat());
+            addMetadata(metadata, DublinCore.DESCRIPTION, schemaDublinCore.getDescription());
+            addMetadata(metadata, DublinCore.CREATOR, schemaDublinCore.getCreators());
+            addMetadata(metadata, DublinCore.SUBJECT, schemaDublinCore.getSubjects());
+        }
+    }
+
+    /**
+     * Extracts basic schema metadata from XMP.
+     *
+     * Exceptions raised while looking up the schema are silently swallowed; properties
+     * that are absent from the packet are skipped.
+     *
+     * @param xmp the XMP Metadata object.
+     * @param metadata the metadata map
+     */
+    public static void extractXMPBasicSchema(XMPMetadata xmp, Metadata metadata) throws IOException {
+        if (xmp == null) {
+            return;
+        }
+        XMPBasicSchema schemaBasic;
+        try {
+            schemaBasic = xmp.getXMPBasicSchema();
+        } catch (Throwable e) {
+            // Swallow
+            return;
+        }
+        if (schemaBasic != null) {
+            addMetadata(metadata, XMP.CREATOR_TOOL, schemaBasic.getCreatorTool());
+            addMetadata(metadata, XMP.CREATE_DATE, schemaBasic.getCreateDate());
+            addMetadata(metadata, XMP.MODIFY_DATE, schemaBasic.getModifyDate());
+            addMetadata(metadata, XMP.METADATA_DATE, schemaBasic.getMetadataDate());
+            addMetadata(metadata, XMP.RATING, schemaBasic.getRating());
+        }
+    }
+
+    /**
+     * Add list to the metadata map.
+     *
+     * @param metadata the metadata map to update.
+     * @param property the property to add.
+     * @param values the values to add.
+     */
+    private static void addMetadata(Metadata metadata, Property property, List<String> values) {
+        if (values != null) {
+            for (String value : values) {
+                addMetadata(metadata, property, value);
+            }
+        }
+    }
+
+    /**
+     * Add value to the metadata map.
+     *
+     * @param metadata the metadata map to update.
+     * @param property the property to add.
+     * @param value the value to add.
+     */
+    private static void addMetadata(Metadata metadata, Property property, String value) {
+        if (value != null) {
+            if (property.isMultiValuePermitted()) {
+                metadata.add(property, value);
+            } else {
+                metadata.set(property, value);
+            }
+        }
+    }
+
+    /**
+     * Add value to the metadata map.
+     *
+     * @param metadata the metadata map to update.
+     * @param property the property to add.
+     * @param value the value to add.
+     */
+    private static void addMetadata(Metadata metadata, Property property, Integer value) {
+        if (value != null) {
+            if (property.isMultiValuePermitted()) {
+                metadata.add(property, value);
+            } else {
+                metadata.set(property, value);
+            }
+        }
+    }
+
+    /**
+     * Add value to the metadata map.
+     *
+     * @param metadata the metadata map to update.
+     * @param property the property to add.
+     * @param value the value to add.
+     */
+    private static void addMetadata(Metadata metadata, Property property, Date value) {
+        if (value != null) {
+            metadata.set(property, value);
+        }
+    }
+
+    /**
+     * Add a calendar value to the metadata map as a date.
+     *
+     * @param metadata the metadata map to update.
+     * @param property the property to add.
+     * @param value the value to add; ignored when <code>null</code>.
+     */
+    private static void addMetadata(Metadata metadata, Property property, Calendar value) {
+        if (value != null) {
+            addMetadata(metadata, property, value.getTime());
+        }
+    }
+
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index 7a7735d..ef112c9 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -166,6 +166,7 @@ public class PackageParser extends AbstractParser {
"application/vnd.sun.xml.draw",
"application/vnd.sun.xml.impress",
"application/vnd.openofficeorg.autotext",
+ "application/vnd.adobe.indesign-idml-package",
"application/x-gtar" //specialization of tar
diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index 45f20e7..b0a0d49 100644
--- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -79,6 +79,7 @@ org.apache.tika.parser.gdal.GDALParser
org.apache.tika.parser.pot.PooledTimeSeriesParser
org.apache.tika.parser.grib.GribParser
org.apache.tika.parser.jdbc.SQLite3Parser
+org.apache.tika.parser.indesign.IDMLParser
org.apache.tika.parser.isatab.ISArchiveParser
org.apache.tika.parser.geoinfo.GeographicInformationParser
org.apache.tika.parser.geo.topic.GeoParser
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/indesign/IDMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/indesign/IDMLParserTest.java
new file mode 100644
index 0000000..ee1495a
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/indesign/IDMLParserTest.java
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.indesign;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.XMP;
+import org.apache.tika.parser.Parser;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Exercises the IDML parser against the sample InDesign package {@code testIndesign.idml}.
+ */
+public class IDMLParserTest extends TikaTest {
+
+    /** Sample document shared by all tests. */
+    private static final String SAMPLE = "testIndesign.idml";
+
+    /** Parser under test. */
+    private final Parser parser = new IDMLParser();
+
+    @Test
+    public void testParserToText() throws Exception {
+        Metadata extracted = new Metadata();
+        String text = getText(SAMPLE, parser, extracted);
+        assertEquals("3", extracted.get("TotalPageCount"));
+        assertEquals("2", extracted.get("MasterSpreadPageCount"));
+        assertEquals("1", extracted.get("SpreadPageCount"));
+        assertEquals("application/vnd.adobe.indesign-idml-package", extracted.get(Metadata.CONTENT_TYPE));
+        assertEquals("2020-09-20T20:07:44Z", extracted.get(XMP.CREATE_DATE));
+        assertEquals("2020-09-20T20:07:44Z", extracted.get(XMP.MODIFY_DATE));
+        assertEquals("Adobe InDesign CC 14.0 (Windows)", extracted.get(XMP.CREATOR_TOOL));
+        assertContains("Lorem ipsum dolor sit amet, consectetur adipiscing elit", text);
+    }
+
+    @Test
+    public void testParserToXML() throws Exception {
+        Metadata extracted = new Metadata();
+        String markup = getXML(SAMPLE, parser, extracted).xml;
+        assertEquals("Adobe InDesign CC 14.0 (Windows)", extracted.get(XMP.CREATOR_TOOL));
+        assertEquals("3", extracted.get("TotalPageCount"));
+        assertContains("<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit", markup);
+        assertContains("<meta name=\"xmp:CreatorTool\" content=\"Adobe InDesign CC 14.0 (Windows)\" />", markup);
+    }
+}
diff --git a/tika-parsers/src/test/resources/test-documents/testIndesign.idml b/tika-parsers/src/test/resources/test-documents/testIndesign.idml
new file mode 100644
index 0000000..3f497b3
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testIndesign.idml differ