You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/12/13 15:29:56 UTC
[tika] 01/02: TIKA-2524 -- add an XPS parser
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 78c8d74a34ccf8ef5d49ba7a242687e1423ca952
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Dec 13 10:11:51 2017 -0500
TIKA-2524 -- add an XPS parser
---
CHANGES.txt | 4 +
.../parser/microsoft/ooxml/MetadataExtractor.java | 4 +-
.../microsoft/ooxml/OOXMLExtractorFactory.java | 15 +-
.../tika/parser/microsoft/ooxml/OOXMLParser.java | 8 +-
.../microsoft/ooxml/xps/XPSExtractorDecorator.java | 262 ++++++++++++++
.../microsoft/ooxml/xps/XPSPageContentHandler.java | 377 +++++++++++++++++++++
.../microsoft/ooxml/xps/XPSTextExtractor.java | 66 ++++
.../tika/parser/pkg/ZipContainerDetector.java | 8 +-
.../parser/microsoft/ooxml/xps/XPSParserTest.java | 84 +++++
.../resources/test-documents/testXPS_various.xps | Bin 0 -> 560946 bytes
10 files changed, 822 insertions(+), 6 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 69307a9..9b27891 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,7 @@
+Release 2.0.0 - ???
+
+ * Add a parser for XPS (TIKA-2524).
+
Release 1.17 - December 8, 2017
***NOTE: THIS IS THE LAST VERSION OF TIKA THAT WILL RUN
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
index 21c6252..dbbb839 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
@@ -36,6 +36,7 @@ import org.apache.tika.metadata.PagedText;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.microsoft.SummaryExtractor;
+import org.apache.tika.parser.microsoft.ooxml.xps.XPSTextExtractor;
import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor;
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
import org.apache.xmlbeans.impl.values.XmlValueOutOfRangeException;
@@ -61,7 +62,8 @@ public class MetadataExtractor {
if (extractor.getDocument() != null ||
((extractor instanceof XSSFEventBasedExcelExtractor ||
extractor instanceof XWPFEventBasedWordExtractor ||
- extractor instanceof XSLFEventBasedPowerPointExtractor) &&
+ extractor instanceof XSLFEventBasedPowerPointExtractor ||
+ extractor instanceof XPSTextExtractor) &&
extractor.getPackage() != null)) {
extractMetadata(extractor.getCoreProperties(), metadata);
extractMetadata(extractor.getExtendedProperties(), metadata);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index b6f7bf5..5230d65 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -45,6 +45,8 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
+import org.apache.tika.parser.microsoft.ooxml.xps.XPSExtractorDecorator;
+import org.apache.tika.parser.microsoft.ooxml.xps.XPSTextExtractor;
import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor;
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
import org.apache.tika.parser.pkg.ZipContainerDetector;
@@ -66,7 +68,7 @@ public class OOXMLExtractorFactory {
ExtractorFactory.setThreadPrefersEventExtractors(true);
try {
- OOXMLExtractor extractor;
+ OOXMLExtractor extractor = null;
OPCPackage pkg;
// Locate or Open the OPCPackage for the file
@@ -83,13 +85,16 @@ public class OOXMLExtractorFactory {
// Get the type, and ensure it's one we handle
MediaType type = ZipContainerDetector.detectOfficeOpenXML(pkg);
+ if (type == null) {
+ type = ZipContainerDetector.detectXPSOPC(pkg);
+ }
+
if (type == null || OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) {
// Not a supported type, delegate to Empty Parser
EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context);
return;
}
metadata.set(Metadata.CONTENT_TYPE, type.toString());
-
// Have the appropriate OOXML text extractor picked
POIXMLTextExtractor poiExtractor = null;
// This has already been set by OOXMLParser's call to configure()
@@ -101,6 +106,10 @@ public class OOXMLExtractorFactory {
if (poiExtractor == null && config.getUseSAXPptxExtractor()) {
poiExtractor = trySXSLF(pkg);
}
+ if (type.equals(OOXMLParser.XPS)) {
+ poiExtractor = new XPSTextExtractor(pkg);
+ }
+
if (poiExtractor == null) {
poiExtractor = ExtractorFactory.createExtractor(pkg);
}
@@ -119,6 +128,8 @@ public class OOXMLExtractorFactory {
extractor = new SXSLFPowerPointExtractorDecorator(metadata, context,
(XSLFEventBasedPowerPointExtractor) poiExtractor);
metadata.add("X-Parsed-By", XSLFEventBasedPowerPointExtractor.class.getCanonicalName());
+ } else if (poiExtractor instanceof XPSTextExtractor) {
+ extractor = new XPSExtractorDecorator(context, poiExtractor);
} else if (document == null) {
throw new TikaException(
"Expecting UserModel based POI OOXML extractor with a document, but none found. " +
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
index fbc0f93..81ec4b6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
@@ -41,6 +41,8 @@ public class OOXMLParser extends AbstractOfficeParser {
ZipSecureFile.setMinInflateRatio(-1.0d);
}
+ protected static final MediaType XPS = MediaType.application("vnd.ms-xpsdocument");
+
protected static final Set<MediaType> SUPPORTED_TYPES =
Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation"),
@@ -84,9 +86,11 @@ public class OOXMLParser extends AbstractOfficeParser {
* by Tika and/or POI.
*/
protected static final Set<MediaType> UNSUPPORTED_OOXML_TYPES =
- Collections.singleton(
+ Collections.EMPTY_SET;
+ //TODO: should we do a singleton for dwfx+xps?
+ /*Collections.singleton(
MediaType.application("vnd.ms-xpsdocument")
- );
+ );*/
/**
* Serial version UID
*/
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java
new file mode 100644
index 0000000..689db28
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java
@@ -0,0 +1,262 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xps;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.openxml4j.opc.ZipPackage;
+import org.apache.poi.openxml4j.util.ZipEntrySource;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.ExceptionUtils;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.zip.ZipEntry;
+
+public class XPSExtractorDecorator extends AbstractOOXMLExtractor {
+
+ private static String XPS_DOCUMENT = "http://schemas.microsoft.com/xps/2005/06/fixedrepresentation";
+
+ private final ParseContext context;
+ private final ZipPackage pkg;
+ Map<String, Metadata> embeddedImages = new HashMap<>();
+
+ /**
+  * Creates a decorator around the given POI extractor.
+  *
+  * @param context   parse context, kept for SAX parser creation and embedded document handling
+  * @param extractor extractor whose package holds the XPS parts
+  * @throws TikaException if the extractor's package is not a {@link ZipPackage}
+  */
+ public XPSExtractorDecorator(ParseContext context, POIXMLTextExtractor extractor) throws TikaException {
+     super(context, extractor);
+     this.context = context;
+     //zip entries are looked up by name later on, so only zip-backed packages are accepted
+     if (!(extractor.getPackage() instanceof ZipPackage)) {
+         throw new TikaException("OPCPackage must be a ZipPackage");
+     }
+     this.pkg = (ZipPackage) extractor.getPackage();
+ }
+
+ /**
+  * Always returns {@code null}; this decorator does not supply a {@link POIXMLDocument}.
+  */
+ @Override
+ public POIXMLDocument getDocument() {
+ return null;
+ }
+
+
+ /**
+  * Emits the text of every document reachable through the package's
+  * fixed-representation relationship(s), then passes each image registered
+  * while the pages were read to the embedded document handling.
+  *
+  * @param xhtml handler receiving the generated XHTML
+  * @throws SAXException on a SAX failure; a {@link TikaException} raised while
+  *                      reading the documents is wrapped in one
+  * @throws IOException  if a part cannot be read
+  */
+ @Override
+ protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException {
+     //there should only be one relationship;
+     //in the test file it points to FixedDocSeq.fdseq
+     PackageRelationshipCollection fixedRepresentations = pkg.getRelationshipsByType(XPS_DOCUMENT);
+     for (int idx = 0; idx < fixedRepresentations.size(); idx++) {
+         PackageRelationship fixedRepresentation = fixedRepresentations.getRelationship(idx);
+         try {
+             handleDocuments(fixedRepresentation, xhtml);
+         } catch (TikaException e) {
+             throw new SAXException(e);
+         }
+     }
+
+     //embeddedImages is filled in by the page handlers while the pages are parsed
+     if (embeddedImages.isEmpty()) {
+         return;
+     }
+     EmbeddedDocumentUtil embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
+     for (Map.Entry<String, Metadata> embeddedImage : embeddedImages.entrySet()) {
+         Metadata imageMetadata = embeddedImage.getValue();
+         if (!embeddedDocumentUtil.shouldParseEmbedded(imageMetadata)) {
+             continue;
+         }
+         handleEmbeddedImage(embeddedImage.getKey(), imageMetadata, embeddedDocumentUtil, xhtml);
+     }
+ }
+
+ private void handleEmbeddedImage(String zipPath, Metadata metadata,
+ EmbeddedDocumentUtil embeddedDocumentUtil,
+ XHTMLContentHandler xhtml) throws SAXException, IOException {
+ InputStream stream = null;
+ try {
+ stream = getZipStream(zipPath, pkg);
+ } catch (IOException|TikaException e) {
+ //store this exception in the parent's metadata
+ EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
+ return;
+ }
+
+ try {
+ embeddedDocumentUtil.parseEmbedded(stream, xhtml, metadata, true);
+ } finally {
+ IOUtils.closeQuietly(stream);
+ }
+ }
+
+ /**
+  * SAX-parses the part targeted by the given relationship with a
+  * {@link FixedDocSeqHandler}, which in turn processes the referenced documents.
+  *
+  * @param packageRelationship relationship pointing at the part to read
+  * @param xhtml               handler receiving the generated XHTML
+  */
+ private void handleDocuments(PackageRelationship packageRelationship,
+         XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
+     PackagePart sequencePart = pkg.getPart(packageRelationship);
+     try (InputStream sequenceStream = sequencePart.getInputStream()) {
+         DefaultHandler sequenceHandler = new OfflineContentHandler(
+                 new EmbeddedContentHandler(new FixedDocSeqHandler(xhtml)));
+         //the try-with-resources block owns the stream, so shield it from the parser
+         context.getSAXParser().parse(new CloseShieldInputStream(sequenceStream), sequenceHandler);
+     }
+ }
+
+ /**
+  * XPS content is located through the fixed-representation relationship in
+  * {@link #buildXHTML(XHTMLContentHandler)}, so no main document parts are reported here.
+  *
+  * @return an empty, immutable list
+  */
+ @Override
+ protected List<PackagePart> getMainDocumentParts() throws TikaException {
+     //type-safe replacement for the raw Collections.EMPTY_LIST constant
+     return Collections.emptyList();
+ }
+
+ private class FixedDocSeqHandler extends DefaultHandler {
+ private final static String DOCUMENT_REFERENCE = "DocumentReference";
+ private final static String SOURCE = "Source";
+
+ private final XHTMLContentHandler xhtml;
+
+ private FixedDocSeqHandler(XHTMLContentHandler xhtml) {
+ this.xhtml = xhtml;
+ }
+
+ @Override
+ public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+ if (!DOCUMENT_REFERENCE.equals(localName)) {
+ return;
+ }
+ for (int i = 0; i < atts.getLength(); i++) {
+ String lName = atts.getLocalName(i);
+ if (SOURCE.equals(lName)) {
+ handleDocumentRef(atts.getValue(i));
+ }
+ }
+ }
+
+ private void handleDocumentRef(String docRef) throws SAXException {
+ //docRef is a path to a FixedDocumentSequence document,
+ // e.g. /Documents/1/FixedDoc.fdoc
+
+ //relative root is /Documents/1 ..need this Pages...
+ String relativeRoot = null;
+ int i = docRef.lastIndexOf("/");
+ if (i > 0) {
+ relativeRoot = docRef.substring(0, i);
+ } else {
+ relativeRoot = "";
+ }
+ String zipPath = (docRef.startsWith("/") ? docRef.substring(1) : docRef);
+ if (pkg instanceof ZipPackage) {
+ try (InputStream stream = getZipStream(zipPath, pkg)) {
+ context.getSAXParser().parse(
+ new CloseShieldInputStream(stream),
+ new OfflineContentHandler(new EmbeddedContentHandler(
+ new PageContentPartHandler(relativeRoot, xhtml))));
+
+ } catch (IOException | TikaException e) {
+ throw new SAXException(new TikaException("IOException trying to read: " + docRef));
+ }
+ } else {
+ throw new SAXException(new TikaException("Package must be ZipPackage"));
+ }
+ }
+
+ private class PageContentPartHandler extends DefaultHandler {
+ private static final String PAGE_CONTENT = "PageContent";
+ private static final String SOURCE = "Source";
+
+ private final String relativeRoot;
+ private final XHTMLContentHandler xhtml;
+
+ private PageContentPartHandler(String relativeRoot, XHTMLContentHandler xhtml) {
+ this.relativeRoot = relativeRoot;
+ this.xhtml = xhtml;
+ }
+
+ @Override
+ public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+ if (!PAGE_CONTENT.equals(localName)) {
+ return;
+ }
+ String pagePath = null;
+ for (int i = 0; i < atts.getLength(); i++) {
+ if (SOURCE.equals(atts.getLocalName(i))) {
+ pagePath = atts.getValue(i);
+ break;
+ }
+ }
+
+ if (pagePath != null) {
+ if (!pagePath.startsWith("/")) {
+ pagePath = relativeRoot + "/" + pagePath;
+ }
+ //trim initial /
+ if (pagePath.startsWith("/")) {
+ pagePath = pagePath.substring(1);
+ }
+ try (InputStream stream = getZipStream(pagePath, pkg)) {
+ context.getSAXParser().parse(
+ new CloseShieldInputStream(stream),
+ new OfflineContentHandler(
+ new XPSPageContentHandler(xhtml, embeddedImages)
+ )
+ );
+ } catch (TikaException | IOException e) {
+ throw new SAXException(e);
+ }
+ }
+
+ }
+ }
+ }
+
+ /**
+  * Opens the zip entry whose name equals the given path. A single leading
+  * slash is dropped from the path before entry names are compared.
+  *
+  * @param zipPath    path of the wanted entry, with or without a leading slash
+  * @param zipPackage package whose zip archive is searched
+  * @return a stream over the matching entry
+  * @throws IOException   if the entry cannot be opened
+  * @throws TikaException if no entry with that name exists
+  */
+ private static InputStream getZipStream(String zipPath, ZipPackage zipPackage) throws IOException, TikaException {
+     String entryName = zipPath;
+     if (zipPath.length() > 1 && zipPath.startsWith("/")) {
+         entryName = zipPath.substring(1);
+     }
+     ZipEntrySource entrySource = zipPackage.getZipArchive();
+     Enumeration<? extends ZipEntry> entries = entrySource.getEntries();
+     while (entries.hasMoreElements()) {
+         ZipEntry candidate = entries.nextElement();
+         if (candidate.getName().equals(entryName)) {
+             return entrySource.getInputStream(candidate);
+         }
+     }
+     throw new TikaException("Couldn't find required zip entry: " + zipPath);
+ }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSPageContentHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSPageContentHandler.java
new file mode 100644
index 0000000..d18825d
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSPageContentHandler.java
@@ -0,0 +1,377 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml.xps;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.Stack;
+
+
+/**
+ * Handles an individual page. For now, this marks up
+ * canvas entities in a {@code <div>} tag. Based on the spec,
+ * it currently relies on order within the xml for order of output
+ * of text to xhtml. We could do more complex processing of coordinates
+ * for bidi-languages, but the spec implies that we should be able
+ * to rely on storage order.
+ * <p>
+ * As with our PDFParser, this currently dumps urls at the bottom of the page
+ * and does not attempt to calculate the correct anchor text.
+ * <p>
+ * TODO: integrate table markup
+ */
+class XPSPageContentHandler extends DefaultHandler {
+
+ private static final String GLYPHS = "Glyphs";
+ private static final String CANVAS = "Canvas";
+ private static final String CLIP = "Clip";
+ private static final String NULL_CLIP = "NULL_CLIP";
+ private static final String UNICODE_STRING = "UnicodeString";
+ private static final String ORIGIN_X = "OriginX";
+ private static final String ORIGIN_Y = "OriginY";
+ private static final String BIDI_LEVEL = "BidiLevel";
+ private static final String INDICES = "Indices";
+ private static final String NAME = "Name";
+ private static final String PATH = "Path";
+ private static final String NAVIGATE_URI = "FixedPage.NavigateUri";
+ private static final String IMAGE_SOURCE = "ImageSource";
+ private static final String IMAGE_BRUSH = "ImageBrush";
+ private static final String AUTOMATION_PROPERITES_HELP_TEXT = "AutomationProperties.HelpText";
+
+ private static final String URL_DIV = "urls";
+ private static final String DIV = "div";
+ private static final String CLASS = "class";
+ private static final String PAGE = "page";
+ private static final String CANVAS_SAX = "canvas";
+ private static final String P = "p";
+ private static final String HREF = "href";
+ private static final String A = "a";
+
+
+ private final XHTMLContentHandler xhml;
+
+ //path in zip file for an image rendered on this page
+ private String imageSourcePathInZip = null;
+ //embedded images sometimes include full path info of original image
+ private String originalLocationOnDrive = null;
+
+ //buffer for the glyph runs within a given canvas
+ //in insertion order
+ private Map<String, List<GlyphRun>> canvases = new LinkedHashMap<>();
+
+ //link targets seen on this page, de-duplicated, in order of first appearance
+ private Set<String> urls = new LinkedHashSet<>();
+ private Stack<String> canvasStack = new Stack<>();
+ private final Map<String, Metadata> embeddedInfos;
+ //sort based on y coordinate of first element in each row
+ //this requires every row to have at least one element
+ private static Comparator<? super List<GlyphRun>> ROW_SORTER = new Comparator<List<GlyphRun>>() {
+ @Override
+ public int compare(List<GlyphRun> o1, List<GlyphRun> o2) {
+ if (o1.get(0).originY < o2.get(0).originY) {
+ return -1;
+ } else if (o1.get(0).originY > o2.get(0).originY) {
+ return 1;
+ }
+ return 0;
+ }
+ };
+
+ /**
+  * @param xhtml         handler that receives the XHTML generated for the page
+  * @param embeddedInfos map from an image's path in the zip to its metadata;
+  *                      this handler adds or updates an entry for each image
+  *                      it finds inside a Path element
+  */
+ public XPSPageContentHandler(XHTMLContentHandler xhtml, Map<String, Metadata> embeddedInfos) {
+ this.xhml = xhtml;
+ this.embeddedInfos = embeddedInfos;
+ }
+
+ @Override
+ public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+ if (CANVAS.equals(localName)) {
+ String clip = getVal(CLIP, atts);
+ if (clip == null) {
+ canvasStack.push(NULL_CLIP);
+ } else {
+ canvasStack.push(clip);
+ }
+ return;
+ } else if (PATH.equals(localName)) {
+ //for now just grab them and dump them at the end of the page.
+ String url = getVal(NAVIGATE_URI, atts);
+ if (url != null) {
+ urls.add(url);
+ }
+ originalLocationOnDrive = getVal(AUTOMATION_PROPERITES_HELP_TEXT, atts);
+ } else if (IMAGE_BRUSH.equals(localName)) {
+ imageSourcePathInZip = getVal(IMAGE_SOURCE, atts);
+ }
+
+ if (!GLYPHS.equals(localName)) {
+ return;
+ }
+ String name = null;
+ Float originX = null;
+ Float originY = null;
+ String unicodeString = null;
+ Integer bidilevel = 1;
+ String indicesString = null;
+
+ for (int i = 0; i < atts.getLength(); i++) {
+ String lName = atts.getLocalName(i);
+ String value = atts.getValue(i);
+ value = (value == null) ? "" : value.trim();
+
+ if (ORIGIN_X.equals(lName) && value.length() > 0) {
+ try {
+ originX = Float.parseFloat(atts.getValue(i));
+ } catch (NumberFormatException e) {
+ throw new SAXException(e);
+ }
+ } else if (ORIGIN_Y.equals(lName) && value.length() > 0) {
+ try {
+ originY = Float.parseFloat(atts.getValue(i));
+ } catch (NumberFormatException e) {
+ throw new SAXException(e);
+ }
+ } else if (UNICODE_STRING.equals(lName)) {
+ unicodeString = atts.getValue(i);
+ } else if (BIDI_LEVEL.equals(lName) && value.length() > 0) {
+ try {
+ bidilevel = Integer.parseInt(atts.getValue(i));
+ } catch (NumberFormatException e) {
+ throw new SAXException(e);
+ }
+ } else if (INDICES.equals(lName)) {
+ indicesString = atts.getValue(i);
+ } else if (NAME.equals(lName)) {
+ name = value;
+ }
+ }
+ if (unicodeString != null) {
+ originX = (originX == null) ? Integer.MIN_VALUE : originX;
+ originY = (originY == null) ? Integer.MAX_VALUE : originY;
+ String currentCanvasClip = (canvasStack.size() > 0) ? canvasStack.peek() : NULL_CLIP;
+ List<GlyphRun> runs = canvases.get(currentCanvasClip);
+ if (runs == null) {
+ runs = new ArrayList<>();
+ }
+ runs.add(new GlyphRun(name, originY, originX, unicodeString, bidilevel, indicesString));
+ canvases.put(currentCanvasClip, runs);
+ }
+
+ }
+
+ /**
+  * Closes a canvas or a path. When a <code>Canvas</code> ends, the current
+  * canvas is popped. When a <code>Path</code> ends and an image source was seen
+  * inside it, the image is registered in the embedded-image map, flagged as
+  * inline and, if not already set, given the remembered help text as its
+  * original resource name.
+  */
+ @Override
+ public void endElement(String uri, String localName, String qName) throws SAXException {
+     if (CANVAS.equals(localName)) {
+         if (!canvasStack.isEmpty()) {
+             canvasStack.pop();
+         }
+         return;
+     }
+     if (!PATH.equals(localName)) {
+         return;
+     }
+
+     //this assumes that there cannot be a path within a path;
+     //if that is wrong, path depth would have to be tracked
+     if (imageSourcePathInZip != null) {
+         Metadata imageMetadata = embeddedInfos.get(imageSourcePathInZip);
+         if (imageMetadata == null) {
+             imageMetadata = new Metadata();
+         }
+         //an original name that is already present is kept
+         if (originalLocationOnDrive != null
+                 && imageMetadata.get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME) == null) {
+             imageMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, originalLocationOnDrive);
+         }
+         imageMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                 TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
+         embeddedInfos.put(imageSourcePathInZip, imageMetadata);
+     }
+
+     //reset the per-path state
+     imageSourcePathInZip = null;
+     originalLocationOnDrive = null;
+ }
+ /**
+  * Opens the div element (class "page") that wraps everything written for this page.
+  */
+ @Override
+ public void startDocument() throws SAXException {
+ xhml.startElement(DIV, CLASS, PAGE);
+ }
+
+ /**
+  * Writes the content buffered for the page and then closes the div
+  * opened in {@link #startDocument()}.
+  */
+ @Override
+ public void endDocument() throws SAXException {
+ writePage();
+ xhml.endElement(DIV);
+ }
+
+
+ /**
+  * Writes everything buffered for the page: one div (class "canvas") per
+  * canvas that has glyph runs, with one paragraph per row, followed by a div
+  * (class "urls") holding a link for every url collected on the page.
+  * The urls are written even when the page has no glyph runs. Both buffers
+  * are emptied afterwards.
+  *
+  * @throws SAXException if the XHTML handler fails
+  */
+ private final void writePage() throws SAXException {
+     for (List<GlyphRun> runs : canvases.values()) {
+         if (runs.size() == 0) {
+             continue;
+         }
+         xhml.startElement(DIV, CLASS, CANVAS_SAX);
+         //a list of rows sorted by the y of the first element in each row
+         List<List<GlyphRun>> rows = buildRows(runs);
+         for (List<GlyphRun> row : rows) {
+             writeRow(row);
+         }
+         xhml.endElement(DIV);
+     }
+     //for now just dump the urls at the end of the page
+     //At some point, we could link them back up to their
+     //true anchor text.
+     if (urls.size() > 0) {
+         xhml.startElement(DIV, CLASS, URL_DIV);
+         for (String u : urls) {
+             xhml.startElement(A, HREF, u);
+             xhml.characters(u);
+             xhml.endElement(A);
+         }
+         xhml.endElement(DIV);
+     }
+     //reset both buffers so that nothing can be written twice
+     canvases.clear();
+     urls.clear();
+ }
+
+ /**
+  * Writes one row as a paragraph, concatenating the text of its runs in the
+  * order in which they are stored in the list.
+  *
+  * @param row glyph runs that make up the row
+  * @throws SAXException if the XHTML handler fails
+  */
+ private void writeRow(List<GlyphRun> row) throws SAXException {
+     //TODO: runs are not reordered by direction; an earlier sketch sorted rows
+     //consisting solely of right-to-left runs (ignoring whitespace-only runs)
+     //right-to-left and all other rows left-to-right
+     xhml.startElement(P);
+     for (int idx = 0; idx < row.size(); idx++) {
+         //TODO: figure out if a space needs to be added between runs
+         xhml.characters(row.get(idx).unicodeString);
+     }
+     xhml.endElement(P);
+ }
+
+ /**
+  * Groups glyph runs into rows. A run joins the last row when its y origin is
+  * within 0.5 of the y origin of that row's first run; otherwise it starts a
+  * new row. Whenever a new row starts below the highest y seen so far
+  * (including that of the very first run), the rows are re-sorted so that
+  * they stay in ascending order of the y of their first run.
+  *
+  * @param glyphRuns runs of one canvas, in stored order
+  * @return the rows, each holding at least one run
+  */
+ private List<List<GlyphRun>> buildRows(List<GlyphRun> glyphRuns) {
+     List<List<GlyphRun>> rows = new ArrayList<>();
+     //highest y seen so far; negative infinity means "nothing seen yet"
+     float maxY = Float.NEGATIVE_INFINITY;
+     for (GlyphRun glyphRun : glyphRuns) {
+         boolean addedNewRow = false;
+         //after each sort the last row is the one with the highest y
+         List<GlyphRun> lastRow = rows.isEmpty() ? null : rows.get(rows.size() - 1);
+         //0.5 is a purely heuristic/magical number that should be derived
+         //from the data, not made up. TODO: fix this
+         if (lastRow != null && Math.abs(glyphRun.originY - lastRow.get(0).originY) < 0.5) {
+             lastRow.add(glyphRun);
+         } else {
+             List<GlyphRun> row = new ArrayList<>();
+             row.add(glyphRun);
+             rows.add(row);
+             addedNewRow = true;
+         }
+         //sort rows so that they are in ascending order of y
+         //in most xps files in our test corpus, this is never triggered
+         //because the runs are already ordered correctly
+         if (addedNewRow && glyphRun.originY < maxY) {
+             Collections.sort(rows, ROW_SORTER);
+         }
+         //the first run is tracked too, so an out-of-order second run is detected
+         if (glyphRun.originY > maxY) {
+             maxY = glyphRun.originY;
+         }
+     }
+     return rows;
+ }
+
+ /**
+  * Looks up an attribute by local name.
+  *
+  * @param localName local name of the wanted attribute
+  * @param atts      attributes of the current element
+  * @return the value of the first attribute with that local name,
+  *         or {@code null} if there is none
+  */
+ private static String getVal(String localName, Attributes atts) {
+     int attributeCount = atts.getLength();
+     for (int idx = 0; idx < attributeCount; idx++) {
+         if (!localName.equals(atts.getLocalName(idx))) {
+             continue;
+         }
+         return atts.getValue(idx);
+     }
+     return null;
+ }
+
+ final static class GlyphRun {
+
+ private enum DIRECTION {
+ LTR,
+ RTL
+ }
+
+ //TODO: use name in conjunction with Frag information
+ //to do a better job of extracting paragraph and table structure
+ private final String name;
+ private final float originY;
+ private final float originX;//not currently used, but could be used for bidi text calculations
+ private final String unicodeString;
+ private final String indicesString;//not currently used, but could be used for width calculations
+
+ //not used yet
+ private final DIRECTION direction;
+
+ private GlyphRun(String name, float originY, float originX, String unicodeString, Integer bidiLevel, String indicesString) {
+ this.name = name;
+ this.unicodeString = unicodeString;
+ this.originY = originY;
+ this.originX = originX;
+ if (bidiLevel == null) {
+ direction = DIRECTION.LTR;
+ } else {
+ if (bidiLevel % 2 == 0) {
+ direction = DIRECTION.LTR;
+ } else {
+ direction = DIRECTION.RTL;
+ }
+ }
+ this.indicesString = indicesString;
+ }
+ }
+
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java
new file mode 100644
index 0000000..30aaf0f
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xps;
+
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.POIXMLProperties;
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.xmlbeans.XmlException;
+
+import java.io.IOException;
+
+/**
+ * Currently, mostly a pass-through class to hold pkg and properties
+ * and keep the general framework similar to our other POI-integrated
+ * extractors.
+ */
+public class XPSTextExtractor extends POIXMLTextExtractor {
+
+    private final OPCPackage pkg;
+    private final POIXMLProperties properties;
+
+    /**
+     * Loads the package-level properties of the given package.
+     *
+     * @param pkg the package of the XPS file
+     * @throws OpenXML4JException declared by the properties loading
+     * @throws XmlException       declared by the properties loading
+     * @throws IOException        declared by the properties loading
+     */
+    public XPSTextExtractor(OPCPackage pkg) throws OpenXML4JException, XmlException, IOException {
+        //no POIXMLDocument is available for XPS, so the superclass gets null;
+        //package and properties are therefore served from this class
+        super((POIXMLDocument)null);
+        this.pkg = pkg;
+        this.properties = new POIXMLProperties(pkg);
+    }
+
+    /**
+     * @return the package passed to the constructor
+     */
+    @Override
+    public OPCPackage getPackage() {
+        return pkg;
+    }
+
+    /**
+     * Text is not extracted by this class.
+     *
+     * @return always {@code null}
+     */
+    @Override
+    public String getText() {
+        return null;
+    }
+
+    /**
+     * @return the core properties loaded from the package
+     */
+    @Override
+    public POIXMLProperties.CoreProperties getCoreProperties() {
+        return this.properties.getCoreProperties();
+    }
+
+    /**
+     * @return the extended properties loaded from the package
+     */
+    @Override
+    public POIXMLProperties.ExtendedProperties getExtendedProperties() {
+        return this.properties.getExtendedProperties();
+    }
+
+    /**
+     * @return the custom properties loaded from the package
+     */
+    @Override
+    public POIXMLProperties.CustomProperties getCustomProperties() {
+        return this.properties.getCustomProperties();
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index 9a5befa..4195fc7 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -64,6 +64,9 @@ public class ZipContainerDetector implements Detector {
// TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes
private static final String STRICT_CORE_DOCUMENT =
"http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument";
+
+ private static final String XPS_DOCUMENT =
+ "http://schemas.microsoft.com/xps/2005/06/fixedrepresentation";
/** Serial version UID */
private static final long serialVersionUID = 2891763938430295453L;
@@ -245,6 +248,9 @@ public class ZipContainerDetector implements Detector {
PackagePart corePart = pkg.getPart(core.getRelationship(0));
String coreType = corePart.getContentType();
+ if (coreType.contains(".xps")) {
+ return MediaType.application("vnd.ms-package.xps");
+ }
// Turn that into the type of the overall document
String docType = coreType.substring(0, coreType.lastIndexOf('.'));
@@ -263,7 +269,7 @@ public class ZipContainerDetector implements Detector {
/**
* Detects Open XML Paper Specification (XPS)
*/
- private static MediaType detectXPSOPC(OPCPackage pkg) {
+ public static MediaType detectXPSOPC(OPCPackage pkg) {
PackageRelationshipCollection xps =
pkg.getRelationshipsByType("http://schemas.microsoft.com/xps/2005/06/fixedrepresentation");
if (xps.size() == 1) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java
new file mode 100644
index 0000000..4fb4488
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java
@@ -0,0 +1,84 @@
+package org.apache.tika.parser.microsoft.ooxml.xps;
+
+import org.apache.tika.Tika;
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.junit.Test;
+
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Parses XPS test documents recursively and checks the reported metadata,
+ * the extracted XHTML content and the embedded resources.
+ */
+public class XPSParserTest extends TikaTest {
+
+ /**
+ * Checks testPPT.xps: container properties, a few content snippets and one embedded JPEG.
+ */
+ @Test
+ public void testBasic() throws Exception {
+ List<Metadata> parsed = getRecursiveMetadata("testPPT.xps");
+ assertEquals(2, parsed.size());
+
+ // properties reported for the container document
+ Metadata container = parsed.get(0);
+ assertEquals("Rajiv", container.get(TikaCoreProperties.CREATOR));
+ assertEquals("2010-06-29T12:06:31Z", container.get(TikaCoreProperties.CREATED));
+ assertEquals("2010-06-29T12:06:31Z", container.get(TikaCoreProperties.MODIFIED));
+ assertEquals("Attachment Test", container.get(TikaCoreProperties.TITLE));
+
+ String xhtml = container.get(RecursiveParserWrapper.TIKA_CONTENT);
+ assertContains("<p>Attachment Test</p>", xhtml);
+ assertContains("<div class=\"canvas\"><p>Different", xhtml);
+
+ // Ideally this would be "tika content", but copy+paste in Windows
+ // also yields "tikacontent", so that is what is expected here.
+ assertContains("tikacontent", xhtml);
+
+ // the second entry is the embedded image
+ Metadata embedded = parsed.get(1);
+ assertEquals("image/jpeg", embedded.get(Metadata.CONTENT_TYPE));
+ }
+
+ /**
+ * Checks testXPS_various.xps: mixed Arabic/English text, a hyperlink,
+ * container properties and three embedded images.
+ */
+ @Test
+ public void testVarious() throws Exception {
+ List<Metadata> parsed = getRecursiveMetadata("testXPS_various.xps");
+ // container plus the embedded images and thumbnails
+ assertEquals(4, parsed.size());
+
+ Metadata container = parsed.get(0);
+ String xhtml = container.get(RecursiveParserWrapper.TIKA_CONTENT);
+
+ // content has to come out in the right order
+ String arabicPhrase = "\u0644\u062B\u0639\u0644\u0628\u0020" +
+ "\u0627\u0644\u0628\u0646\u064A\u0020" +
+ "\u0627\u0644\u0633\u0631\u064A\u0639";
+ assertContains(arabicPhrase, xhtml);
+ assertContains("The \u0627\u0644\u0628\u0646\u064A fox", xhtml);
+ assertContains("\u0644\u062B\u0639\u0644\u0628 brown \u0627\u0644\u0633\u0631\u064A\u0639",
+ xhtml);
+
+ // hyperlinks must survive as anchors
+ assertContains("<a href=\"http://tika.apache.org/\">http://tika.apache.org/</a>",
+ xhtml);
+
+ // properties reported for the container document
+ assertEquals("Allison, Timothy B.", container.get(TikaCoreProperties.CREATOR));
+ assertEquals("2017-12-12T11:15:38Z", container.get(TikaCoreProperties.CREATED));
+ assertEquals("2017-12-12T11:15:38Z", container.get(TikaCoreProperties.MODIFIED));
+
+ // embedded resources, in the order they are reported
+ Metadata firstImage = parsed.get(1);
+ assertEquals("image/png", firstImage.get(Metadata.CONTENT_TYPE));
+
+ Metadata inlineJpeg = parsed.get(2);
+ assertEquals("image/jpeg", inlineJpeg.get(Metadata.CONTENT_TYPE));
+ assertContains("INetCache", inlineJpeg.get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
+ assertEquals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString(),
+ inlineJpeg.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+
+ Metadata lastImage = parsed.get(3);
+ assertEquals("image/jpeg", lastImage.get(Metadata.CONTENT_TYPE));
+ // TODO(review): a disabled assertion here expected EmbeddedResourceType.THUMBNAIL as the
+ // EMBEDDED_RESOURCE_TYPE, but it read from inlineJpeg rather than this last entry;
+ // confirm the intended target before re-enabling it.
+ }
+
+}
diff --git a/tika-parsers/src/test/resources/test-documents/testXPS_various.xps b/tika-parsers/src/test/resources/test-documents/testXPS_various.xps
new file mode 100644
index 0000000..a5186d1
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testXPS_various.xps differ
--
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.