You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2020/09/20 23:03:56 UTC

[tika] branch branch_1x updated: TIKA-3188: Add Adobe Indesign IDML Parser

This is an automated email from the ASF dual-hosted git repository.

dmeikle pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new 56ad418  TIKA-3188: Add Adobe Indesign IDML Parser
56ad418 is described below

commit 56ad41892036dbd75e5fe8ebb34100c8aafde757
Author: David Meikle <dm...@apache.org>
AuthorDate: Mon Sep 21 00:02:13 2020 +0100

    TIKA-3188: Add Adobe Indesign IDML Parser
---
 .../org/apache/tika/mime/tika-mimetypes.xml        |   8 +-
 tika-parsers/pom.xml                               |   6 +
 .../indesign/ContentAndMetadataExtractor.java      | 108 ++++++++++++
 .../apache/tika/parser/indesign/IDMLParser.java    | 187 +++++++++++++++++++++
 .../parser/indesign/xmp/XMPMetadataExtractor.java  | 162 ++++++++++++++++++
 .../org/apache/tika/parser/pkg/PackageParser.java  |   1 +
 .../services/org.apache.tika.parser.Parser         |   1 +
 .../tika/parser/indesign/IDMLParserTest.java       |  62 +++++++
 .../resources/test-documents/testIndesign.idml     | Bin 0 -> 30574 bytes
 9 files changed, 534 insertions(+), 1 deletion(-)

diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index a92ab22..2acb8ea 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -60,7 +60,6 @@
 	  <sub-class-of type="application/xml"/>
    </mime-type>
 
-
   <mime-type type="application/atom+xml">
     <root-XML localName="feed" namespaceURI="http://purl.org/atom/ns#"/>
     <root-XML localName="feed" namespaceURI="http://www.w3.org/2005/Atom"/>
@@ -280,6 +279,13 @@
   </mime-type>
 
   <mime-type type="application/im-iscomposing+xml"/>
+
+  <mime-type type="application/vnd.adobe.indesign-idml-package">
+    <sub-class-of type="application/zip"/>
+    <_comment>IDML</_comment>
+    <glob pattern="*.idml"/>
+  </mime-type>
+
   <mime-type type="application/index"/>
   <mime-type type="application/index.cmd"/>
   <mime-type type="application/index.obj"/>
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index 09b97ce..15a80db 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -49,6 +49,7 @@
     <vorbis.version>0.8</vorbis.version>
     <pdfbox.version>2.0.21</pdfbox.version>
     <jempbox.version>1.8.16</jempbox.version>
+    <xmpbox.version>2.0.21</xmpbox.version>
     <netcdf-java.version>4.5.5</netcdf-java.version>
     <sis.version>1.0</sis.version>
     <parso.version>2.0.12</parso.version>
@@ -213,6 +214,11 @@
       <artifactId>jempbox</artifactId>
       <version>${jempbox.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.pdfbox</groupId>
+      <artifactId>xmpbox</artifactId>
+      <version>${xmpbox.version}</version>
+    </dependency>
     <!-- TIKA-370: PDFBox declares the Bouncy Castle dependencies
          as optional, but we prefer to have them always to avoid
          problems with encrypted PDFs. -->
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/indesign/ContentAndMetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/indesign/ContentAndMetadataExtractor.java
new file mode 100644
index 0000000..d6947b2
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/indesign/ContentAndMetadataExtractor.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.indesign;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Extractor for InDesign Content and Metadata.
+ *
+ * <p>Static helper that SAX-parses one XML part of an IDML package and forwards the
+ * character data found inside <code>Content</code> elements to a content handler as
+ * XHTML <code>p</code> elements. The <code>PageCount</code> attribute of
+ * <code>Spread</code> / <code>MasterSpread</code> elements is recorded in the
+ * supplied metadata object.</p>
+ */
+class ContentAndMetadataExtractor {
+
+    private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
+
+    /**
+     * Attribute name read from spread elements and metadata key it is stored under.
+     */
+    private static final String PAGE_COUNT = "PageCount";
+
+    /**
+     * Utility class with static members only; not meant to be instantiated.
+     */
+    private ContentAndMetadataExtractor() {
+    }
+
+    /**
+     * Extract the InDesign Story Content and emit to the <code>XHTMLContentHandler</code>.
+     *
+     * <p>The stream is wrapped in a close shield before parsing, so it is left open for
+     * the caller (which matters when it is a shared zip stream).</p>
+     *
+     * @param stream the document stream (input)
+     * @param handler handler for the XHTML SAX events (output)
+     * @param metadata document metadata (input and output)
+     * @param context parse context
+     * @throws IOException if the document stream could not be read
+     * @throws SAXException if the SAX events could not be processed
+     * @throws TikaException if the document could not be parsed
+     */
+    static void extract(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        // Parse the content using inner content handler
+        XMLReaderUtils.parseSAX(
+                new CloseShieldInputStream(stream), new ContentAndMetadataHandler(handler, metadata), context
+        );
+    }
+
+    /**
+     * Content handler for InDesign Content and Metadata.
+     *
+     * <p>Only start/end of <code>Content</code> elements and the character data between
+     * them are forwarded to the wrapped handler; all other events are dropped.</p>
+     */
+    static class ContentAndMetadataHandler extends DefaultHandler {
+
+        private final ContentHandler handler;
+        private final Metadata metadata;
+        private boolean inContent = false;
+
+        ContentAndMetadataHandler(ContentHandler handler, Metadata metadata) {
+            this.handler = handler;
+            this.metadata = metadata;
+        }
+
+        @Override
+        public void startElement(
+                String uri, String localName, String qName, Attributes attributes)
+                throws SAXException {
+
+            // Get Spread Metadata; skip the value when the attribute is absent so that
+            // no null entry is ever appended to the metadata values.
+            if ("Spread".equals(localName) || "MasterSpread".equals(localName)) {
+                String pageCount = attributes.getValue(PAGE_COUNT);
+                if (pageCount != null) {
+                    metadata.add(PAGE_COUNT, pageCount);
+                }
+            }
+
+            // Trigger processing of content from Spread or Stories
+            if ("Content".equals(localName)) {
+                inContent = true;
+                handler.startElement(XHTMLContentHandler.XHTML, "p", "p", EMPTY_ATTRIBUTES);
+            }
+        }
+
+        @Override
+        public void characters(char[] ch, int start, int length) throws SAXException {
+            // Text outside of a Content element is not part of the document body.
+            if (inContent) {
+                handler.characters(ch, start, length);
+            }
+        }
+
+        @Override
+        public void endElement(String uri, String localName, String qName) throws SAXException {
+            if ("Content".equals(localName)) {
+                inContent = false;
+                handler.endElement(XHTMLContentHandler.XHTML, "p", "p");
+            }
+        }
+    }
+
+
+}
\ No newline at end of file
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/indesign/IDMLParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/indesign/IDMLParser.java
new file mode 100644
index 0000000..6832594
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/indesign/IDMLParser.java
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.indesign;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.indesign.xmp.XMPMetadataExtractor;
+import org.apache.tika.sax.EndDocumentShieldingContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
+import java.util.zip.ZipInputStream;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Adobe InDesign IDML Parser.
+ */
+public class IDMLParser extends AbstractParser {
+
+    /**
+     * IDML MimeType
+     */
+    private static final MediaType IDML_CONTENT_TYPE
+            = MediaType.application("vnd.adobe.indesign-idml-package");
+
+    /**
+     * Supported types set.
+     */
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(IDML_CONTENT_TYPE);
+
+    /**
+     * Metadata file name.
+     */
+    private static final String META_NAME = "META-INF/metadata.xml";
+
+    /**
+     * Internal page count.
+     */
+    private int pageCount = 0;
+
+    /**
+     * Internal master spread count.
+     */
+    private int masterSpreadCount = 0;
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler baseHandler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        ZipFile zipFile = null;
+        ZipInputStream zipStream = null;
+        if (stream instanceof TikaInputStream) {
+            TikaInputStream tis = (TikaInputStream) stream;
+            Object container = ((TikaInputStream) stream).getOpenContainer();
+            if (container instanceof ZipFile) {
+                zipFile = (ZipFile) container;
+            } else if (tis.hasFile()) {
+                zipFile = new ZipFile(tis.getFile());
+            } else {
+                zipStream = new ZipInputStream(stream);
+            }
+        } else {
+            zipStream = new ZipInputStream(stream);
+        }
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);
+        xhtml.startDocument();
+        EndDocumentShieldingContentHandler handler = new EndDocumentShieldingContentHandler(xhtml);
+
+        if (zipFile != null) {
+            try {
+                handleZipFile(zipFile, metadata, context, handler);
+            } finally {
+                zipFile.close();
+            }
+        } else {
+            try {
+                handleZipStream(zipStream, metadata, context, handler);
+            } finally {
+                zipStream.close();
+            }
+        }
+
+        metadata.set("SpreadPageCount", Integer.toString(pageCount));
+        metadata.set("MasterSpreadPageCount", Integer.toString(masterSpreadCount));
+        metadata.set("TotalPageCount", Integer.toString(pageCount + masterSpreadCount));
+
+        xhtml.endDocument();
+
+        if (handler.getEndDocumentWasCalled()) {
+            handler.reallyEndDocument();
+        }
+    }
+
+    private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context,
+                                 ContentHandler handler) throws IOException, TikaException, SAXException {
+        ZipEntry entry = zipStream.getNextEntry();
+        if (entry == null) {
+            throw new IOException("No entries found in ZipInputStream");
+        }
+        do {
+            handleZipEntry(entry, zipStream, metadata, context, handler);
+            entry = zipStream.getNextEntry();
+        } while (entry != null);
+    }
+
+    private void handleZipFile(ZipFile zipFile, Metadata metadata, ParseContext context, ContentHandler handler)
+            throws IOException, TikaException, SAXException {
+
+        ZipEntry entry = zipFile.getEntry(META_NAME);
+        if (entry != null) {
+            handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
+        }
+
+        Enumeration<? extends ZipEntry> entries = zipFile.entries();
+        while (entries.hasMoreElements()) {
+            entry = entries.nextElement();
+            if (!META_NAME.equals(entry.getName())) {
+                handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
+            }
+        }
+    }
+
+    private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
+                                ParseContext context, ContentHandler handler)
+            throws IOException, SAXException, TikaException {
+
+        if (entry == null) {
+            return;
+        }
+
+        if (entry.getName().equals("mimetype")) {
+            String type = IOUtils.toString(zip, UTF_8);
+            metadata.set(Metadata.CONTENT_TYPE, type);
+        } else if (entry.getName().equals("META-INF/metadata.xml")) {
+            XMPMetadataExtractor.parse(zip, metadata);
+        } else if (entry.getName().contains("MasterSpreads")) {
+            Metadata embeddedMeta = new Metadata();
+            ContentAndMetadataExtractor.extract(zip, handler, embeddedMeta, context);
+            int spreadCount = Integer.parseInt(embeddedMeta.get("PageCount"));
+            masterSpreadCount += spreadCount;
+        } else if (entry.getName().contains("Spreads/Spread")) {
+            Metadata embeddedMeta = new Metadata();
+            ContentAndMetadataExtractor.extract(zip, handler, embeddedMeta, context);
+            int spreadCount = Integer.parseInt(embeddedMeta.get("PageCount"));
+            pageCount += spreadCount;
+        }  else if (entry.getName().contains("Stories")) {
+            ContentAndMetadataExtractor.extract(zip, handler, new Metadata(), context);
+        }
+
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/indesign/xmp/XMPMetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/indesign/xmp/XMPMetadataExtractor.java
new file mode 100644
index 0000000..1cb11ee
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/indesign/xmp/XMPMetadataExtractor.java
@@ -0,0 +1,162 @@
+package org.apache.tika.parser.indesign.xmp;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.CloseShieldInputStream;
+import org.apache.tika.metadata.DublinCore;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.XMP;
+import org.apache.xmpbox.XMPMetadata;
+import org.apache.xmpbox.schema.DublinCoreSchema;
+import org.apache.xmpbox.schema.XMPBasicSchema;
+import org.apache.xmpbox.xml.DomXmpParser;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Date;
+import java.util.List;
+
+/**
+ * XMP Metadata Extractor based on Apache XmpBox.
+ *
+ * <p>Extraction is best effort: a packet that cannot be parsed, or a schema that cannot
+ * be read, is skipped without reporting an error.</p>
+ */
+public class XMPMetadataExtractor {
+
+    /**
+     * Parse the XMP Packets.
+     *
+     * <p>The stream is wrapped in a close shield and is therefore left open. Any failure
+     * while parsing the packet is swallowed and leaves the metadata untouched.</p>
+     *
+     * @param stream the stream to parse.
+     * @param metadata the metadata collection to update
+     * @throws IOException on any IO error.
+     * @throws TikaException on any Tika error.
+     */
+    public static void parse(InputStream stream, Metadata metadata) throws IOException, TikaException {
+        XMPMetadata xmp;
+        try {
+            DomXmpParser xmpParser = new DomXmpParser();
+            xmpParser.setStrictParsing(false);
+            xmp = xmpParser.parse(new CloseShieldInputStream(stream));
+        } catch (Throwable ignored) {
+            // Deliberate best effort: broken XMP must not fail the enclosing parse.
+            return;
+        }
+        extractDublinCoreSchema(xmp, metadata);
+        extractXMPBasicSchema(xmp, metadata);
+    }
+
+    /**
+     * Extracts Dublin Core.
+     *
+     * Silently swallows exceptions raised while looking up the schema.
+     * @param xmp the XMP Metadata object, may be <code>null</code> (nothing is extracted then).
+     * @param metadata the metadata map
+     * @throws IOException declared for callers; see the method body for what is caught.
+     */
+    public static void extractDublinCoreSchema(XMPMetadata xmp, Metadata metadata) throws IOException {
+        if (xmp == null) {
+            return;
+        }
+        DublinCoreSchema schemaDublinCore;
+        try {
+            schemaDublinCore = xmp.getDublinCoreSchema();
+        } catch (Throwable ignored) {
+            // Deliberate best effort: an unreadable schema is treated as absent.
+            return;
+        }
+        if (schemaDublinCore != null) {
+            addMetadata(metadata, DublinCore.TITLE, schemaDublinCore.getTitle());
+            addMetadata(metadata, DublinCore.FORMAT, schemaDublinCore.getFormat());
+            addMetadata(metadata, DublinCore.DESCRIPTION, schemaDublinCore.getDescription());
+            addMetadata(metadata, DublinCore.CREATOR, schemaDublinCore.getCreators());
+            addMetadata(metadata, DublinCore.SUBJECT, schemaDublinCore.getSubjects());
+        }
+    }
+
+    /**
+     * Extracts basic schema metadata from XMP.
+     *
+     * Silently swallows exceptions raised while looking up the schema. Date properties
+     * that are missing from the packet are skipped.
+     * @param xmp the XMP Metadata object, may be <code>null</code> (nothing is extracted then).
+     * @param metadata the metadata map
+     * @throws IOException declared for callers; see the method body for what is caught.
+     */
+    public static void extractXMPBasicSchema(XMPMetadata xmp, Metadata metadata) throws IOException {
+        if (xmp == null) {
+            return;
+        }
+        XMPBasicSchema schemaBasic;
+        try {
+            schemaBasic = xmp.getXMPBasicSchema();
+        } catch (Throwable ignored) {
+            // Deliberate best effort: an unreadable schema is treated as absent.
+            return;
+        }
+        if (schemaBasic != null) {
+            addMetadata(metadata, XMP.CREATOR_TOOL, schemaBasic.getCreatorTool());
+            addMetadata(metadata, XMP.CREATE_DATE, toDate(schemaBasic.getCreateDate()));
+            addMetadata(metadata, XMP.MODIFY_DATE, toDate(schemaBasic.getModifyDate()));
+            // The metadata date has its own property; it is not the modify date.
+            addMetadata(metadata, XMP.METADATA_DATE, toDate(schemaBasic.getMetadataDate()));
+            addMetadata(metadata, XMP.RATING, schemaBasic.getRating());
+        }
+    }
+
+    /**
+     * Converts a calendar to a date, passing <code>null</code> through.
+     *
+     * @param calendar the calendar to convert, may be <code>null</code>.
+     * @return the corresponding date, or <code>null</code> if no calendar was given.
+     */
+    private static Date toDate(java.util.Calendar calendar) {
+        return calendar == null ? null : calendar.getTime();
+    }
+
+    /**
+     * Add list to the metadata map.
+     *
+     * @param metadata the metadata map to update.
+     * @param property the property to add.
+     * @param values the values to add; a <code>null</code> list is ignored.
+     */
+    private static void addMetadata(Metadata metadata, Property property, List<String> values) {
+        if (values != null) {
+            for (String value : values) {
+                addMetadata(metadata, property, value);
+            }
+        }
+    }
+
+    /**
+     * Add value to the metadata map.
+     *
+     * @param metadata the metadata map to update.
+     * @param property the property to add.
+     * @param value the value to add; <code>null</code> is ignored.
+     */
+    private static void addMetadata(Metadata metadata, Property property, String value) {
+        if (value != null) {
+            // Multi-valued properties accumulate, single-valued ones are overwritten.
+            if (property.isMultiValuePermitted()) {
+                metadata.add(property, value);
+            } else {
+                metadata.set(property, value);
+            }
+        }
+    }
+
+    /**
+     * Add value to the metadata map.
+     *
+     * @param metadata the metadata map to update.
+     * @param property the property to add.
+     * @param value the value to add; <code>null</code> is ignored.
+     */
+    private static void addMetadata(Metadata metadata, Property property, Integer value) {
+        if (value != null) {
+            // Multi-valued properties accumulate, single-valued ones are overwritten.
+            if (property.isMultiValuePermitted()) {
+                metadata.add(property, value);
+            } else {
+                metadata.set(property, value);
+            }
+        }
+    }
+
+    /**
+     * Add value to the metadata map.
+     *
+     * @param metadata the metadata map to update.
+     * @param property the property to add.
+     * @param value the value to add; <code>null</code> is ignored.
+     */
+    private static void addMetadata(Metadata metadata, Property property, Date value) {
+        if (value != null) {
+            metadata.set(property, value);
+        }
+    }
+
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index 7a7735d..ef112c9 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -166,6 +166,7 @@ public class PackageParser extends AbstractParser {
                 "application/vnd.sun.xml.draw",
                 "application/vnd.sun.xml.impress",
                 "application/vnd.openofficeorg.autotext",
+                "application/vnd.adobe.indesign-idml-package",
 
 
                 "application/x-gtar" //specialization of tar
diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index 45f20e7..b0a0d49 100644
--- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -79,6 +79,7 @@ org.apache.tika.parser.gdal.GDALParser
 org.apache.tika.parser.pot.PooledTimeSeriesParser
 org.apache.tika.parser.grib.GribParser
 org.apache.tika.parser.jdbc.SQLite3Parser
+org.apache.tika.parser.indesign.IDMLParser
 org.apache.tika.parser.isatab.ISArchiveParser
 org.apache.tika.parser.geoinfo.GeographicInformationParser
 org.apache.tika.parser.geo.topic.GeoParser
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/indesign/IDMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/indesign/IDMLParserTest.java
new file mode 100644
index 0000000..ee1495a
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/indesign/IDMLParserTest.java
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.indesign;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.XMP;
+import org.apache.tika.parser.Parser;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Exercises the IDML parser against the bundled sample document, checking both the
+ * plain-text and the XHTML output along with the extracted metadata.
+ */
+public class IDMLParserTest extends TikaTest {
+
+    /**
+     * Parser under test.
+     */
+    private final Parser parser = new IDMLParser();
+
+    /**
+     * Plain-text extraction: page counts, content type, XMP dates, creator tool and body text.
+     */
+    @Test
+    public void testParserToText() throws Exception {
+        final Metadata extracted = new Metadata();
+        final String text = getText("testIndesign.idml", parser, extracted);
+
+        // Page counts derived from the spreads and master spreads
+        assertEquals("3", extracted.get("TotalPageCount"));
+        assertEquals("2", extracted.get("MasterSpreadPageCount"));
+        assertEquals("1", extracted.get("SpreadPageCount"));
+
+        // Content type taken from the package
+        assertEquals("application/vnd.adobe.indesign-idml-package", extracted.get(Metadata.CONTENT_TYPE));
+
+        // XMP metadata
+        assertEquals("2020-09-20T20:07:44Z", extracted.get(XMP.CREATE_DATE));
+        assertEquals("2020-09-20T20:07:44Z", extracted.get(XMP.MODIFY_DATE));
+        assertEquals("Adobe InDesign CC 14.0 (Windows)", extracted.get(XMP.CREATOR_TOOL));
+
+        // Body text
+        assertContains("Lorem ipsum dolor sit amet, consectetur adipiscing elit", text);
+    }
+
+    /**
+     * XHTML extraction: metadata values plus paragraph and meta markup in the output.
+     */
+    @Test
+    public void testParserToXML() throws Exception {
+        final Metadata extracted = new Metadata();
+        final String markup = getXML("testIndesign.idml", parser, extracted).xml;
+
+        assertEquals("Adobe InDesign CC 14.0 (Windows)", extracted.get(XMP.CREATOR_TOOL));
+        assertEquals("3", extracted.get("TotalPageCount"));
+
+        assertContains("<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit", markup);
+        assertContains("<meta name=\"xmp:CreatorTool\" content=\"Adobe InDesign CC 14.0 (Windows)\" />", markup);
+    }
+
+}
diff --git a/tika-parsers/src/test/resources/test-documents/testIndesign.idml b/tika-parsers/src/test/resources/test-documents/testIndesign.idml
new file mode 100644
index 0000000..3f497b3
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testIndesign.idml differ