You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/12/13 15:29:56 UTC

[tika] 01/02: TIKA-2524 -- add an XPS parser

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 78c8d74a34ccf8ef5d49ba7a242687e1423ca952
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Dec 13 10:11:51 2017 -0500

    TIKA-2524 -- add an XPS parser
---
 CHANGES.txt                                        |   4 +
 .../parser/microsoft/ooxml/MetadataExtractor.java  |   4 +-
 .../microsoft/ooxml/OOXMLExtractorFactory.java     |  15 +-
 .../tika/parser/microsoft/ooxml/OOXMLParser.java   |   8 +-
 .../microsoft/ooxml/xps/XPSExtractorDecorator.java | 262 ++++++++++++++
 .../microsoft/ooxml/xps/XPSPageContentHandler.java | 377 +++++++++++++++++++++
 .../microsoft/ooxml/xps/XPSTextExtractor.java      |  66 ++++
 .../tika/parser/pkg/ZipContainerDetector.java      |   8 +-
 .../parser/microsoft/ooxml/xps/XPSParserTest.java  |  84 +++++
 .../resources/test-documents/testXPS_various.xps   | Bin 0 -> 560946 bytes
 10 files changed, 822 insertions(+), 6 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 69307a9..9b27891 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,7 @@
+Release 2.0.0 - ???
+
+   * Add a parser for XPS (TIKA-2524).
+
 Release 1.17 - December 8, 2017
 
   ***NOTE: THIS IS THE LAST VERSION OF TIKA THAT WILL RUN
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
index 21c6252..dbbb839 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
@@ -36,6 +36,7 @@ import org.apache.tika.metadata.PagedText;
 import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.microsoft.SummaryExtractor;
+import org.apache.tika.parser.microsoft.ooxml.xps.XPSTextExtractor;
 import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor;
 import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
 import org.apache.xmlbeans.impl.values.XmlValueOutOfRangeException;
@@ -61,7 +62,8 @@ public class MetadataExtractor {
         if (extractor.getDocument() != null ||
                 ((extractor instanceof XSSFEventBasedExcelExtractor ||
                         extractor instanceof XWPFEventBasedWordExtractor ||
-                        extractor instanceof XSLFEventBasedPowerPointExtractor) &&
+                        extractor instanceof XSLFEventBasedPowerPointExtractor ||
+                        extractor instanceof XPSTextExtractor) &&
                         extractor.getPackage() != null)) {
             extractMetadata(extractor.getCoreProperties(), metadata);
             extractMetadata(extractor.getExtendedProperties(), metadata);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index b6f7bf5..5230d65 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -45,6 +45,8 @@ import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.microsoft.OfficeParserConfig;
+import org.apache.tika.parser.microsoft.ooxml.xps.XPSExtractorDecorator;
+import org.apache.tika.parser.microsoft.ooxml.xps.XPSTextExtractor;
 import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor;
 import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
 import org.apache.tika.parser.pkg.ZipContainerDetector;
@@ -66,7 +68,7 @@ public class OOXMLExtractorFactory {
         ExtractorFactory.setThreadPrefersEventExtractors(true);
 
         try {
-            OOXMLExtractor extractor;
+            OOXMLExtractor extractor = null;
             OPCPackage pkg;
 
             // Locate or Open the OPCPackage for the file
@@ -83,13 +85,16 @@ public class OOXMLExtractorFactory {
 
             // Get the type, and ensure it's one we handle
             MediaType type = ZipContainerDetector.detectOfficeOpenXML(pkg);
+            if (type == null) {
+                type = ZipContainerDetector.detectXPSOPC(pkg);
+            }
+
             if (type == null || OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) {
                 // Not a supported type, delegate to Empty Parser
                 EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context);
                 return;
             }
             metadata.set(Metadata.CONTENT_TYPE, type.toString());
-
             // Have the appropriate OOXML text extractor picked
             POIXMLTextExtractor poiExtractor = null;
             // This has already been set by OOXMLParser's call to configure()
@@ -101,6 +106,10 @@ public class OOXMLExtractorFactory {
             if (poiExtractor == null && config.getUseSAXPptxExtractor()) {
                 poiExtractor = trySXSLF(pkg);
             }
+            if (type.equals(OOXMLParser.XPS)) {
+                poiExtractor = new XPSTextExtractor(pkg);
+            }
+
             if (poiExtractor == null) {
                 poiExtractor = ExtractorFactory.createExtractor(pkg);
             }
@@ -119,6 +128,8 @@ public class OOXMLExtractorFactory {
                 extractor = new SXSLFPowerPointExtractorDecorator(metadata, context,
                         (XSLFEventBasedPowerPointExtractor) poiExtractor);
                 metadata.add("X-Parsed-By", XSLFEventBasedPowerPointExtractor.class.getCanonicalName());
+            } else if (poiExtractor instanceof XPSTextExtractor) {
+                extractor = new XPSExtractorDecorator(context, poiExtractor);
             } else if (document == null) {
                 throw new TikaException(
                         "Expecting UserModel based POI OOXML extractor with a document, but none found. " +
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
index fbc0f93..81ec4b6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
@@ -41,6 +41,8 @@ public class OOXMLParser extends AbstractOfficeParser {
         ZipSecureFile.setMinInflateRatio(-1.0d);
     }
 
+    protected static final MediaType XPS = MediaType.application("vnd.ms-xpsdocument");
+
     protected static final Set<MediaType> SUPPORTED_TYPES =
             Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
                     MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation"),
@@ -84,9 +86,11 @@ public class OOXMLParser extends AbstractOfficeParser {
      * by Tika and/or POI.
      */
     protected static final Set<MediaType> UNSUPPORTED_OOXML_TYPES =
-            Collections.singleton(
+            Collections.emptySet();
+        //TODO: should we do a singleton for dwfx+xps?
+            /*Collections.singleton(
                     MediaType.application("vnd.ms-xpsdocument")
-            );
+            );*/
     /**
      * Serial version UID
      */
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java
new file mode 100644
index 0000000..689db28
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java
@@ -0,0 +1,262 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xps;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.openxml4j.opc.ZipPackage;
+import org.apache.poi.openxml4j.util.ZipEntrySource;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.ExceptionUtils;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.zip.ZipEntry;
+
+public class XPSExtractorDecorator extends AbstractOOXMLExtractor {
+
+    //relationship type used in buildXHTML to find the starting part of the package
+    private static final String XPS_DOCUMENT = "http://schemas.microsoft.com/xps/2005/06/fixedrepresentation";
+
+    private final ParseContext context;
+    private final ZipPackage pkg;
+    //filled by the page handlers while pages are parsed and read back in buildXHTML;
+    //the key is handed to getZipStream as the path of the image inside the zip
+    final Map<String, Metadata> embeddedImages = new HashMap<>();
+
+    /**
+     * Wraps {@code extractor} and keeps hold of its underlying zip package.
+     *
+     * @param context   parse context, also used later to obtain SAX parsers
+     * @param extractor extractor whose package holds the XPS content
+     * @throws TikaException if the extractor's package is not a {@link ZipPackage}
+     */
+    public XPSExtractorDecorator(ParseContext context, POIXMLTextExtractor extractor) throws TikaException {
+        super(context, extractor);
+        this.context = context;
+        //fail fast: everything below reads entries straight out of the zip archive
+        if (!(extractor.getPackage() instanceof ZipPackage)) {
+            throw new TikaException("OPCPackage must be a ZipPackage");
+        }
+        this.pkg = (ZipPackage) extractor.getPackage();
+    }
+
+    /**
+     * Always returns {@code null}; this decorator does not provide a {@link POIXMLDocument}.
+     */
+    @Override
+    public POIXMLDocument getDocument() {
+        return null;
+    }
+
+
+    /**
+     * Parses every part reached through a package relationship of type
+     * {@link #XPS_DOCUMENT} into {@code xhtml}, then passes each image collected in
+     * {@link #embeddedImages} during that parse to the embedded-document handling.
+     *
+     * @throws SAXException on SAX failures, or wrapping a {@link TikaException}
+     *                      raised while handling a document
+     */
+    @Override
+    protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException {
+        PackageRelationshipCollection relationships = pkg.getRelationshipsByType(XPS_DOCUMENT);
+        for (int idx = 0; idx < relationships.size(); idx++) {
+            //there should only be one.
+            //in the test file, this points to FixedDocSeq.fdseq
+            PackageRelationship relationship = relationships.getRelationship(idx);
+            try {
+                handleDocuments(relationship, xhtml);
+            } catch (TikaException e) {
+                throw new SAXException(e);
+            }
+        }
+
+        //embedded images are handled only after all of the page content
+        if (embeddedImages.isEmpty()) {
+            return;
+        }
+        EmbeddedDocumentUtil embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
+        for (Map.Entry<String, Metadata> entry : embeddedImages.entrySet()) {
+            Metadata imageMetadata = entry.getValue();
+            if (!embeddedDocumentUtil.shouldParseEmbedded(imageMetadata)) {
+                continue;
+            }
+            handleEmbeddedImage(entry.getKey(), imageMetadata, embeddedDocumentUtil, xhtml);
+        }
+    }
+
+    /**
+     * Opens the zip entry at {@code zipPath} and passes the stream to
+     * {@code embeddedDocumentUtil} for parsing.  If the entry cannot be opened,
+     * the exception is recorded on {@code metadata} and the image is skipped.
+     * The stream is always closed quietly after parsing.
+     */
+    private void handleEmbeddedImage(String zipPath, Metadata metadata,
+                                      EmbeddedDocumentUtil embeddedDocumentUtil,
+                                     XHTMLContentHandler xhtml) throws SAXException, IOException {
+        InputStream stream = null;
+        try {
+            stream = getZipStream(zipPath, pkg);
+        } catch (IOException|TikaException e) {
+            //store this exception in the parent's metadata
+            //NOTE(review): what is passed here is the image's own metadata, not the
+            //container's; it does not appear to be read again after this return -- confirm
+            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
+            return;
+        }
+
+        try {
+            embeddedDocumentUtil.parseEmbedded(stream, xhtml, metadata, true);
+        } finally {
+            IOUtils.closeQuietly(stream);
+        }
+    }
+
+    /**
+     * SAX-parses the part targeted by {@code packageRelationship} with a
+     * {@code FixedDocSeqHandler} that writes into {@code xhtml}.
+     */
+    private void handleDocuments(PackageRelationship packageRelationship,
+                                 XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
+
+        try (InputStream partStream = pkg.getPart(packageRelationship).getInputStream()) {
+            //the shield keeps the SAX parser from closing the stream;
+            //try-with-resources closes it instead
+            InputStream shielded = new CloseShieldInputStream(partStream);
+            OfflineContentHandler sequenceHandler = new OfflineContentHandler(
+                    new EmbeddedContentHandler(new FixedDocSeqHandler(xhtml)));
+            context.getSAXParser().parse(shielded, sequenceHandler);
+        }
+    }
+
+    /**
+     * Returns an empty list; this class locates its content through package
+     * relationships in {@link #buildXHTML} instead.
+     *
+     * @return an empty, immutable list
+     */
+    @Override
+    protected List<PackagePart> getMainDocumentParts() throws TikaException {
+        //type-safe equivalent of the raw Collections.EMPTY_LIST
+        return Collections.emptyList();
+    }
+
+    /**
+     * SAX handler for the part that {@link #handleDocuments} parses
+     * (FixedDocSeq.fdseq in the test file).  For each {@code DocumentReference}
+     * element, the part named by its {@code Source} attribute is read from the
+     * zip and parsed with a {@code PageContentPartHandler}.
+     */
+    private class FixedDocSeqHandler extends DefaultHandler {
+        private final static String DOCUMENT_REFERENCE = "DocumentReference";
+        private final static String SOURCE = "Source";
+
+        private final XHTMLContentHandler xhtml;
+
+        private FixedDocSeqHandler(XHTMLContentHandler xhtml) {
+            this.xhtml = xhtml;
+        }
+
+        @Override
+        public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+            if (!DOCUMENT_REFERENCE.equals(localName)) {
+                return;
+            }
+            for (int i = 0; i < atts.getLength(); i++) {
+                String lName = atts.getLocalName(i);
+                if (SOURCE.equals(lName)) {
+                    handleDocumentRef(atts.getValue(i));
+                }
+            }
+        }
+
+        /**
+         * Parses the document part that {@code docRef} points to.
+         *
+         * @param docRef value of a {@code Source} attribute,
+         *               e.g. /Documents/1/FixedDoc.fdoc
+         * @throws SAXException on SAX failures, or wrapping a {@link TikaException}
+         *                      (which carries the original failure as its cause)
+         *                      if the part cannot be found or read
+         */
+        private void handleDocumentRef(String docRef) throws SAXException {
+            //relativeRoot is the directory part of docRef, e.g. /Documents/1;
+            //page paths that do not start with "/" are resolved against it
+            int lastSlash = docRef.lastIndexOf("/");
+            String relativeRoot = (lastSlash > 0) ? docRef.substring(0, lastSlash) : "";
+            String zipPath = (docRef.startsWith("/") ? docRef.substring(1) : docRef);
+
+            //pkg is a final, non-null ZipPackage (checked in the outer constructor),
+            //so no further instanceof test is needed here
+            try (InputStream stream = getZipStream(zipPath, pkg)) {
+                context.getSAXParser().parse(
+                        new CloseShieldInputStream(stream),
+                        new OfflineContentHandler(new EmbeddedContentHandler(
+                                new PageContentPartHandler(relativeRoot, xhtml))));
+
+            } catch (IOException | TikaException e) {
+                //keep the underlying failure as the cause so that it is not lost
+                throw new SAXException(new TikaException("IOException trying to read: " + docRef, e));
+            }
+        }
+
+        /**
+         * SAX handler for a document part.  For each {@code PageContent} element,
+         * the page named by its {@code Source} attribute is read from the zip and
+         * parsed with an {@link XPSPageContentHandler}.
+         */
+        private class PageContentPartHandler extends DefaultHandler {
+            private static final String PAGE_CONTENT = "PageContent";
+            private static final String SOURCE = "Source";
+
+            private final String relativeRoot;
+            private final XHTMLContentHandler xhtml;
+
+            private PageContentPartHandler(String relativeRoot, XHTMLContentHandler xhtml) {
+                this.relativeRoot = relativeRoot;
+                this.xhtml = xhtml;
+            }
+
+            @Override
+            public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+                if (!PAGE_CONTENT.equals(localName)) {
+                    return;
+                }
+                //only the first Source attribute is used
+                String pagePath = null;
+                for (int i = 0; i < atts.getLength(); i++) {
+                    if (SOURCE.equals(atts.getLocalName(i))) {
+                        pagePath = atts.getValue(i);
+                        break;
+                    }
+                }
+
+                if (pagePath != null) {
+                    //a path without a leading "/" is relative to the document's directory
+                    if (!pagePath.startsWith("/")) {
+                        pagePath = relativeRoot + "/" + pagePath;
+                    }
+                    //trim initial /
+                    if (pagePath.startsWith("/")) {
+                        pagePath = pagePath.substring(1);
+                    }
+                    try (InputStream stream = getZipStream(pagePath, pkg)) {
+                        context.getSAXParser().parse(
+                                new CloseShieldInputStream(stream),
+                                new OfflineContentHandler(
+                                        new XPSPageContentHandler(xhtml, embeddedImages)
+                                )
+                        );
+                    } catch (TikaException | IOException e) {
+                        throw new SAXException(e);
+                    }
+                }
+
+            }
+        }
+    }
+
+    /**
+     * Finds the zip entry whose name equals {@code zipPath} (ignoring one leading
+     * "/" when the path is longer than a single character) and opens it.
+     *
+     * @throws TikaException if no entry with that name exists
+     * @throws IOException   if the archive cannot be read
+     */
+    private static InputStream getZipStream(String zipPath, ZipPackage zipPackage) throws IOException, TikaException {
+        String targPath = zipPath;
+        if (zipPath.length() > 1 && zipPath.startsWith("/")) {
+            targPath = zipPath.substring(1);
+        }
+        ZipEntrySource archive = zipPackage.getZipArchive();
+        //linear scan over the archive; the first entry with a matching name wins
+        for (Enumeration<? extends ZipEntry> entries = archive.getEntries(); entries.hasMoreElements();) {
+            ZipEntry candidate = entries.nextElement();
+            if (candidate.getName().equals(targPath)) {
+                return archive.getInputStream(candidate);
+            }
+        }
+        throw new TikaException("Couldn't find required zip entry: " + zipPath);
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSPageContentHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSPageContentHandler.java
new file mode 100644
index 0000000..d18825d
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSPageContentHandler.java
@@ -0,0 +1,377 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml.xps;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.Stack;
+
+
+/**
+ * Handles an individual page.  For now, this marks up
+ * canvas entities in a &lt;div&gt; tag.  Based on the spec,
+ * it currently relies on order within the xml for order of output
+ * of text to xhtml.  We could do more complex processing of coordinates
+ * for bidi-languages, but the spec implies that we should be able
+ * to rely on storage order.
+ * <p>
+ * As with our PDFParser, this currently dumps urls at the bottom of the page
+ * and does not attempt to calculate the correct anchor text.
+ * <p>
+ * TODO: integrate table markup
+ */
+class XPSPageContentHandler extends DefaultHandler {
+
+    //element and attribute names that are looked for in the page xml
+    private static final String GLYPHS = "Glyphs";
+    private static final String CANVAS = "Canvas";
+    private static final String CLIP = "Clip";
+    //key used for glyph runs whose canvas has no Clip, or that are outside any canvas
+    private static final String NULL_CLIP = "NULL_CLIP";
+    private static final String UNICODE_STRING = "UnicodeString";
+    private static final String ORIGIN_X = "OriginX";
+    private static final String ORIGIN_Y = "OriginY";
+    private static final String BIDI_LEVEL = "BidiLevel";
+    private static final String INDICES = "Indices";
+    private static final String NAME = "Name";
+    private static final String PATH = "Path";
+    private static final String NAVIGATE_URI = "FixedPage.NavigateUri";
+    private static final String IMAGE_SOURCE = "ImageSource";
+    private static final String IMAGE_BRUSH = "ImageBrush";
+    private static final String AUTOMATION_PROPERITES_HELP_TEXT = "AutomationProperties.HelpText";
+
+    //element names, attribute names and class values written to the xhtml output
+    private static final String URL_DIV = "urls";
+    private static final String DIV = "div";
+    private static final String CLASS = "class";
+    private static final String PAGE = "page";
+    private static final String CANVAS_SAX = "canvas";
+    private static final String P = "p";
+    private static final String HREF = "href";
+    private static final String A = "a";
+
+
+    private final XHTMLContentHandler xhml;
+
+    //path in zip file for an image rendered on this page
+    private String imageSourcePathInZip = null;
+    //embedded images sometimes include full path info of original image
+    private String originalLocationOnDrive = null;
+
+    //buffer for the glyph runs within a given canvas
+    //in insertion order
+    private final Map<String, List<GlyphRun>> canvases = new LinkedHashMap<>();
+
+    //urls seen on this page, in order of first appearance
+    private final Set<String> urls = new LinkedHashSet<>();
+    //Clip value (or NULL_CLIP) of each currently open Canvas element
+    private final Stack<String> canvasStack = new Stack<>();
+    private final Map<String, Metadata> embeddedInfos;
+    //sort based on y coordinate of first element in each row
+    //this requires every row to have at least one element
+    private static final Comparator<? super List<GlyphRun>> ROW_SORTER = new Comparator<List<GlyphRun>>() {
+        @Override
+        public int compare(List<GlyphRun> o1, List<GlyphRun> o2) {
+            if (o1.get(0).originY < o2.get(0).originY) {
+                return -1;
+            } else if (o1.get(0).originY > o2.get(0).originY) {
+                return 1;
+            }
+            return 0;
+        }
+    };
+
+    /**
+     * @param xhtml         handler that the page's markup is written to
+     * @param embeddedInfos map that this handler adds image metadata to, keyed by
+     *                      the {@code ImageSource} attribute value of the image
+     */
+    public XPSPageContentHandler(XHTMLContentHandler xhtml, Map<String, Metadata> embeddedInfos) {
+        this.xhml = xhtml;
+        this.embeddedInfos = embeddedInfos;
+    }
+
+    /**
+     * Records what is needed from the page markup:
+     * <ul>
+     * <li>{@code Canvas}: pushes its {@code Clip} value (or a placeholder) so that
+     *     the glyph runs that follow are grouped under it;</li>
+     * <li>{@code Path}: remembers any {@code FixedPage.NavigateUri} and the
+     *     {@code AutomationProperties.HelpText} value;</li>
+     * <li>{@code ImageBrush}: remembers the {@code ImageSource} value;</li>
+     * <li>{@code Glyphs}: buffers a {@link GlyphRun} if a {@code UnicodeString}
+     *     attribute is present.</li>
+     * </ul>
+     *
+     * @throws SAXException if OriginX, OriginY or BidiLevel is non-empty but not numeric
+     */
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+        if (CANVAS.equals(localName)) {
+            String clip = getVal(CLIP, atts);
+            canvasStack.push(clip == null ? NULL_CLIP : clip);
+            return;
+        } else if (PATH.equals(localName)) {
+            //for now just grab them and dump them at the end of the page.
+            String url = getVal(NAVIGATE_URI, atts);
+            if (url != null) {
+                urls.add(url);
+            }
+            originalLocationOnDrive = getVal(AUTOMATION_PROPERITES_HELP_TEXT, atts);
+        } else if (IMAGE_BRUSH.equals(localName)) {
+            imageSourcePathInZip = getVal(IMAGE_SOURCE, atts);
+        }
+
+        if (!GLYPHS.equals(localName)) {
+            return;
+        }
+        String name = null;
+        Float originX = null;
+        Float originY = null;
+        String unicodeString = null;
+        //the XPS schema default for BidiLevel is 0, i.e. left-to-right
+        Integer bidilevel = 0;
+        String indicesString = null;
+
+        for (int i = 0; i < atts.getLength(); i++) {
+            String lName = atts.getLocalName(i);
+            String value = atts.getValue(i);
+            value = (value == null) ? "" : value.trim();
+
+            try {
+                if (ORIGIN_X.equals(lName) && value.length() > 0) {
+                    originX = Float.parseFloat(value);
+                } else if (ORIGIN_Y.equals(lName) && value.length() > 0) {
+                    originY = Float.parseFloat(value);
+                } else if (UNICODE_STRING.equals(lName)) {
+                    //text is taken untrimmed: its whitespace is content
+                    unicodeString = atts.getValue(i);
+                } else if (BIDI_LEVEL.equals(lName) && value.length() > 0) {
+                    //parse the trimmed value: unlike Float.parseFloat,
+                    //Integer.parseInt rejects surrounding whitespace
+                    bidilevel = Integer.parseInt(value);
+                } else if (INDICES.equals(lName)) {
+                    indicesString = atts.getValue(i);
+                } else if (NAME.equals(lName)) {
+                    name = value;
+                }
+            } catch (NumberFormatException e) {
+                throw new SAXException(e);
+            }
+        }
+        if (unicodeString == null) {
+            return;
+        }
+        //runs without coordinates get extreme placeholder values
+        float x = (originX == null) ? Integer.MIN_VALUE : originX;
+        float y = (originY == null) ? Integer.MAX_VALUE : originY;
+        String currentCanvasClip = canvasStack.isEmpty() ? NULL_CLIP : canvasStack.peek();
+        List<GlyphRun> runs = canvases.get(currentCanvasClip);
+        if (runs == null) {
+            runs = new ArrayList<>();
+            canvases.put(currentCanvasClip, runs);
+        }
+        runs.add(new GlyphRun(name, y, x, unicodeString, bidilevel, indicesString));
+    }
+
+    /**
+     * Pops the canvas stack when a {@code Canvas} closes.  When a {@code Path}
+     * closes and an image source was seen inside it, stores or updates that
+     * image's metadata in the shared map and then clears the per-path state.
+     */
+    @Override
+    public void endElement(String uri, String localName, String qName) throws SAXException {
+        if (CANVAS.equals(localName)) {
+            if (!canvasStack.isEmpty()) {
+                canvasStack.pop();
+            }
+            return;
+        }
+        if (!PATH.equals(localName)) {
+            return;
+        }
+        //this assumes that there cannot be a path within a path
+        //not sure if this is true or if we need to track path depth
+        if (imageSourcePathInZip != null) {
+            Metadata imageMetadata = embeddedInfos.get(imageSourcePathInZip);
+            if (imageMetadata == null) {
+                imageMetadata = new Metadata();
+            }
+            //the first original location recorded for an image is kept
+            if (originalLocationOnDrive != null
+                    && imageMetadata.get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME) == null) {
+                imageMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, originalLocationOnDrive);
+            }
+            imageMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                    TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
+            embeddedInfos.put(imageSourcePathInZip, imageMetadata);
+        }
+        //reset
+        imageSourcePathInZip = null;
+        originalLocationOnDrive = null;
+    }
+    /**
+     * Opens the div that wraps this page's output (class "page").
+     */
+    @Override
+    public void startDocument() throws SAXException {
+        xhml.startElement(DIV, CLASS, PAGE);
+    }
+
+    /**
+     * Writes out everything buffered for this page, then closes the div
+     * that {@link #startDocument()} opened.
+     */
+    @Override
+    public void endDocument() throws SAXException {
+        writePage();
+        xhml.endElement(DIV);
+    }
+
+
+    /**
+     * Writes the buffered content of the page: one div (class "canvas") per
+     * canvas, holding one paragraph per row of glyph runs, followed by a div
+     * (class "urls") that lists the links found on the page.
+     * <p>
+     * The url div is written even when the page has no glyph runs, so links
+     * on a page without text are not lost.
+     */
+    private void writePage() throws SAXException {
+        for (List<GlyphRun> runs : canvases.values()) {
+            if (runs.isEmpty()) {
+                continue;
+            }
+            xhml.startElement(DIV, CLASS, CANVAS_SAX);
+            //a list of rows sorted by the y of the first element in each row
+            List<List<GlyphRun>> rows = buildRows(runs);
+            for (List<GlyphRun> row : rows) {
+                writeRow(row);
+            }
+            xhml.endElement(DIV);
+        }
+        //for now just dump the urls at the end of the page
+        //At some point, we could link them back up to their
+        //true anchor text.
+        if (!urls.isEmpty()) {
+            xhml.startElement(DIV, CLASS, URL_DIV);
+            for (String u : urls) {
+                xhml.startElement(A, HREF, u);
+                xhml.characters(u);
+                xhml.endElement(A);
+            }
+            xhml.endElement(DIV);
+        }
+        canvases.clear();
+    }
+
+    /**
+     * Writes one row as a paragraph: the text of each run, in list order,
+     * with nothing inserted between runs.
+     */
+    private void writeRow(List<GlyphRun> row) throws SAXException {
+        //TODO: order the runs within a row by direction.  The idea is to ignore
+        //runs that are null or only whitespace, then sort the row right-to-left
+        //if every remaining run is RTL and left-to-right otherwise.  This needs
+        //RTL/LTR comparators on GlyphRun, which do not exist yet.
+
+        xhml.startElement(P);
+        for (GlyphRun run : row) {
+            //TODO: figure out whether a space needs to be added between runs
+            xhml.characters(run.unicodeString);
+        }
+        xhml.endElement(P);
+    }
+
+    /**
+     * Groups glyph runs into rows.  A run joins the last row when its y is
+     * within 0.5 of the y of that row's first run; otherwise it starts a new row.
+     * Whenever a new row starts above the largest y seen so far, the rows are
+     * re-sorted so that they stay in increasing order of their first run's y.
+     *
+     * @param glyphRuns runs of a single canvas, in document order
+     * @return rows (each a non-empty list of runs), ordered by the y of their first run
+     */
+    private List<List<GlyphRun>> buildRows(List<GlyphRun> glyphRuns) {
+        List<List<GlyphRun>> rows = new ArrayList<>();
+        //largest y seen so far, counting the very first run as well
+        float maxY = Float.NEGATIVE_INFINITY;
+        for (GlyphRun glyphRun : glyphRuns) {
+            //can rely on the last row having the highest y
+            List<GlyphRun> lastRow = rows.isEmpty() ? null : rows.get(rows.size() - 1);
+            //0.5 is a purely heuristic/magical number that should be derived
+            //from the data, not made up. TODO: fix this
+            if (lastRow != null && Math.abs(glyphRun.originY - lastRow.get(0).originY) < 0.5) {
+                lastRow.add(glyphRun);
+            } else {
+                List<GlyphRun> row = new ArrayList<>();
+                row.add(glyphRun);
+                rows.add(row);
+                //sort rows so that they are in ascending order of y
+                //in most xps files in our test corpus, this is never triggered
+                //because the runs are already ordered correctly
+                if (glyphRun.originY < maxY) {
+                    Collections.sort(rows, ROW_SORTER);
+                }
+            }
+            if (glyphRun.originY > maxY) {
+                maxY = glyphRun.originY;
+            }
+        }
+        return rows;
+    }
+
+    /**
+     * Returns the value of the first attribute whose local name equals
+     * {@code localName}, or {@code null} if there is none.
+     */
+    private static String getVal(String localName, Attributes atts) {
+        int idx = 0;
+        while (idx < atts.getLength()) {
+            String candidate = atts.getLocalName(idx);
+            if (localName.equals(candidate)) {
+                return atts.getValue(idx);
+            }
+            idx++;
+        }
+        return null;
+    }
+
+    /**
+     * Immutable holder for the attributes read from one {@code Glyphs} element.
+     */
+    final static class GlyphRun {
+
+        private enum DIRECTION {
+            LTR,
+            RTL
+        }
+
+        //TODO: use name in conjunction with Frag information
+        //to do a better job of extracting paragraph and table structure
+        private final String name;
+        private final float originY;
+        //not currently used, but could be used for bidi text calculations
+        private final float originX;
+        private final String unicodeString;
+        //not currently used, but could be used for width calculations
+        private final String indicesString;
+
+        //not used yet
+        private final DIRECTION direction;
+
+        private GlyphRun(String name, float originY, float originX, String unicodeString, Integer bidiLevel, String indicesString) {
+            this.name = name;
+            this.originY = originY;
+            this.originX = originX;
+            this.unicodeString = unicodeString;
+            this.indicesString = indicesString;
+            //a missing or even level is left-to-right; an odd level is right-to-left
+            boolean rightToLeft = bidiLevel != null && bidiLevel % 2 != 0;
+            this.direction = rightToLeft ? DIRECTION.RTL : DIRECTION.LTR;
+        }
+    }
+
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java
new file mode 100644
index 0000000..30aaf0f
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xps;
+
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.POIXMLProperties;
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.xmlbeans.XmlException;
+
+import java.io.IOException;
+
+/**
+ * Thin holder for the {@link OPCPackage} and its {@link POIXMLProperties} that
+ * keeps the general framework similar to our other POI-integrated extractors.
+ * It extracts no text itself: {@link #getText()} always returns {@code null}.
+ */
+public class XPSTextExtractor extends POIXMLTextExtractor {
+
+    private final OPCPackage pkg;
+    private final POIXMLProperties properties;
+
+    /**
+     * Wraps the given package and loads its properties up front.
+     *
+     * @param pkg package to wrap; also used to build the {@link POIXMLProperties}
+     * @throws OpenXML4JException if the package properties cannot be loaded
+     * @throws XmlException if the package properties cannot be loaded
+     * @throws IOException if the package properties cannot be loaded
+     */
+    public XPSTextExtractor(OPCPackage pkg) throws OpenXML4JException, XmlException, IOException {
+        //no POIXMLDocument is available here, so the superclass gets a null document
+        //and package/property access is answered from the fields of this class
+        super((POIXMLDocument)null);
+        this.pkg = pkg;
+        this.properties = new POIXMLProperties(pkg);
+    }
+
+    /**
+     * @return the package that was handed to the constructor
+     */
+    @Override
+    public OPCPackage getPackage() {
+        return pkg;
+    }
+
+    /**
+     * @return always {@code null}; this class does not extract text
+     */
+    @Override
+    public String getText() {
+        return null;
+    }
+
+    /**
+     * @return the core properties loaded from the wrapped package
+     */
+    @Override
+    public POIXMLProperties.CoreProperties getCoreProperties() {
+        return this.properties.getCoreProperties();
+    }
+
+    /**
+     * @return the extended properties loaded from the wrapped package
+     */
+    @Override
+    public POIXMLProperties.ExtendedProperties getExtendedProperties() {
+        return this.properties.getExtendedProperties();
+    }
+
+    /**
+     * @return the custom properties loaded from the wrapped package
+     */
+    @Override
+    public POIXMLProperties.CustomProperties getCustomProperties() {
+        return this.properties.getCustomProperties();
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index 9a5befa..4195fc7 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -64,6 +64,9 @@ public class ZipContainerDetector implements Detector {
     // TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes 
     private static final String STRICT_CORE_DOCUMENT = 
             "http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument";
+
+    private static final String XPS_DOCUMENT =
+            "http://schemas.microsoft.com/xps/2005/06/fixedrepresentation";
     
     /** Serial version UID */
     private static final long serialVersionUID = 2891763938430295453L;
@@ -245,6 +248,9 @@ public class ZipContainerDetector implements Detector {
         PackagePart corePart = pkg.getPart(core.getRelationship(0));
         String coreType = corePart.getContentType();
 
+        if (coreType.contains(".xps")) {
+            return MediaType.application("vnd.ms-package.xps");
+        }
         // Turn that into the type of the overall document
         String docType = coreType.substring(0, coreType.lastIndexOf('.'));
 
@@ -263,7 +269,7 @@ public class ZipContainerDetector implements Detector {
     /**
      * Detects Open XML Paper Specification (XPS)
      */
-    private static MediaType detectXPSOPC(OPCPackage pkg) {
+    public static MediaType detectXPSOPC(OPCPackage pkg) {
         PackageRelationshipCollection xps = 
                 pkg.getRelationshipsByType("http://schemas.microsoft.com/xps/2005/06/fixedrepresentation");
         if (xps.size() == 1) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java
new file mode 100644
index 0000000..4fb4488
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xps;
+
+import org.apache.tika.Tika;
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.junit.Test;
+
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Checks metadata, extracted content and embedded resources for the XPS test documents.
+ */
+public class XPSParserTest extends TikaTest {
+
+    /**
+     * Verifies metadata, content markup and the embedded jpeg for testPPT.xps.
+     */
+    @Test
+    public void testBasic() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testPPT.xps");
+        assertEquals(2, metadataList.size());
+
+        //metadata
+        Metadata metadata = metadataList.get(0);
+        assertEquals("Rajiv", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("2010-06-29T12:06:31Z", metadata.get(TikaCoreProperties.CREATED));
+        assertEquals("2010-06-29T12:06:31Z", metadata.get(TikaCoreProperties.MODIFIED));
+        assertEquals("Attachment Test", metadata.get(TikaCoreProperties.TITLE));
+
+        String content = metadata.get(RecursiveParserWrapper.TIKA_CONTENT);
+        assertContains("<p>Attachment Test</p>", content);
+        assertContains("<div class=\"canvas\"><p>Different", content);
+
+        //I'd want this to be "tika content", but copy+paste in Windows yields tikacontent
+        assertContains("tikacontent", content);
+
+        assertEquals("image/jpeg", metadataList.get(1).get(Metadata.CONTENT_TYPE));
+    }
+
+    /**
+     * Verifies text order, hyperlinks, metadata and embedded images for testXPS_various.xps.
+     */
+    @Test
+    public void testVarious() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testXPS_various.xps");
+        //confirm embedded images and thumbnails were extracted
+        assertEquals(4, metadataList.size());
+
+        //now check for content in the right order
+        String quickBrownFox = "\u0644\u062B\u0639\u0644\u0628\u0020" +
+                "\u0627\u0644\u0628\u0646\u064A\u0020" +
+                "\u0627\u0644\u0633\u0631\u064A\u0639";
+
+        String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
+        assertContains(quickBrownFox, content);
+
+        assertContains("The \u0627\u0644\u0628\u0646\u064A fox", content);
+
+        assertContains("\u0644\u062B\u0639\u0644\u0628 brown \u0627\u0644\u0633\u0631\u064A\u0639",
+                content);
+
+        //make sure the urls come through
+        assertContains("<a href=\"http://tika.apache.org/\">http://tika.apache.org/</a>",
+                content);
+
+        Metadata metadata = metadataList.get(0);
+        assertEquals("Allison, Timothy B.", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("2017-12-12T11:15:38Z", metadata.get(TikaCoreProperties.CREATED));
+        assertEquals("2017-12-12T11:15:38Z", metadata.get(TikaCoreProperties.MODIFIED));
+
+        assertEquals("image/png", metadataList.get(1).get(Metadata.CONTENT_TYPE));
+
+        Metadata inlineJpeg = metadataList.get(2);
+        assertEquals("image/jpeg", inlineJpeg.get(Metadata.CONTENT_TYPE));
+        assertContains("INetCache", inlineJpeg.get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
+        assertEquals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString(),
+                inlineJpeg.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+
+        assertEquals("image/jpeg", metadataList.get(3).get(Metadata.CONTENT_TYPE));
+        //TODO(review): also assert EmbeddedResourceType.THUMBNAIL for metadataList.get(3).
+        //The check was disabled in the original commit (presumably the type is not set
+        //yet -- confirm) and it inspected inlineJpeg (index 2) instead of index 3.
+    }
+}
diff --git a/tika-parsers/src/test/resources/test-documents/testXPS_various.xps b/tika-parsers/src/test/resources/test-documents/testXPS_various.xps
new file mode 100644
index 0000000..a5186d1
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testXPS_various.xps differ

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.