You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/01/11 15:37:46 UTC
[4/5] tika git commit: TIKA-2210 -- add experimental SAX parser for
pptx and update (also TIKA-2191 and TIKA-2220)
http://git-wip-us.apache.org/repos/asf/tika/blob/68161573/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
new file mode 100644
index 0000000..a7de780
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
@@ -0,0 +1,462 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackagePartName;
+import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.openxml4j.opc.PackagingURIHelper;
+import org.apache.poi.openxml4j.opc.TargetMode;
+import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xslf.usermodel.XSLFRelation;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * SAX/Streaming pptx extractor
+ */
+public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
+
+ private final static String HANDOUT_MASTER = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/handoutMaster";
+
+ //a pptx file should have one of these "main story" parts.
+ //NOTE: despite the "RELATIONS" in the name, every entry is a part content type
+ //(obtained via getContentType()). The constructor tries them in this order and
+ //uses the first part found for the first content type that has any part.
+ private final static String[] MAIN_STORY_PART_RELATIONS = new String[]{
+ XSLFRelation.MAIN.getContentType(),
+ XSLFRelation.PRESENTATION_MACRO.getContentType(),
+ XSLFRelation.PRESENTATIONML.getContentType(),
+ XSLFRelation.PRESENTATIONML_TEMPLATE.getContentType(),
+ XSLFRelation.MACRO.getContentType(),
+ XSLFRelation.MACRO_TEMPLATE.getContentType(),
+ XSLFRelation.THEME_MANAGER.getContentType()
+
+
+ //TODO: what else
+ };
+
+ private final OPCPackage opcPackage;
+ private final ParseContext context;
+ private PackagePart mainDocument = null;
+ private final CommentAuthors commentAuthors = new CommentAuthors();
+
+ /**
+  * Creates the decorator and locates the main presentation part of the package.
+  *
+  * @param context parse context, also used later to obtain SAX parsers
+  * @param extractor event-based extractor that supplies the OPC package
+  */
+ public SXSLFPowerPointExtractorDecorator(ParseContext context, XSLFEventBasedPowerPointExtractor extractor) {
+     super(context, extractor);
+     this.context = context;
+     this.opcPackage = extractor.getPackage();
+     //probe the candidate content types in declaration order and keep the first
+     //part of the first type that is present; mainDocument stays null otherwise
+     for (String candidateType : MAIN_STORY_PART_RELATIONS) {
+         List<PackagePart> candidates = opcPackage.getPartsByContentType(candidateType);
+         if (!candidates.isEmpty()) {
+             mainDocument = candidates.get(0);
+             break;
+         }
+     }
+     //if mainDocument == null, throw exception
+ }
+
+ /**
+  * Writes the presentation content to the handler: first every slide related to
+  * the main document part (in the order the relationships are returned), then the
+  * slide master part(s), then the handout master part(s).
+  *
+  * @param xhtml handler receiving the extracted content
+  * @throws SAXException if the handler or a SAX parse fails
+  * @throws IOException if a slide part cannot be read
+  * @see XSLFPowerPointExtractor#getText()
+  */
+ protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException {
+
+     loadCommentAuthors();
+
+     //TODO: should check for custShowLst and order based on sldLst
+     try {
+         PackageRelationshipCollection prc = mainDocument.getRelationshipsByType(XSLFRelation.SLIDE.getRelation());
+         for (int i = 0; i < prc.size(); i++) {
+             handleSlidePart(mainDocument.getRelatedPart(prc.getRelationship(i)), xhtml);
+         }
+     } catch (InvalidFormatException e) {
+         //deliberately ignored: slides already written are kept and the
+         //master parts below are still attempted
+     }
+     //slide masters: placeholder shapes are filtered out by PlaceHolderSkipper
+     handleBasicRelatedParts(XSLFRelation.SLIDE_MASTER.getRelation(),
+             "slide-master",
+             mainDocument,
+             new PlaceHolderSkipper(new OOXMLWordAndPowerPointTextHandler(
+                     new OOXMLTikaBodyPartHandler(xhtml), new HashMap<String, String>())));
+
+     handleBasicRelatedParts(HANDOUT_MASTER,
+             "slide-handout-master",
+             mainDocument,
+             new OOXMLWordAndPowerPointTextHandler(
+                     new OOXMLTikaBodyPartHandler(xhtml), new HashMap<String, String>())
+     );
+ }
+
+ /**
+  * Parses every comment-authors part related to the main document and records the
+  * authors in {@code commentAuthors} (via XSLFCommentAuthorHandler).
+  * <p>
+  * Best effort: a relationship or part that cannot be resolved, read or parsed is
+  * skipped without failing the overall extraction.
+  */
+ private void loadCommentAuthors() {
+     PackageRelationshipCollection prc = null;
+     try {
+         prc = mainDocument.getRelationshipsByType(XSLFRelation.COMMENT_AUTHORS.getRelation());
+     } catch (InvalidFormatException e) {
+         //ignored: prc stays null and is handled just below
+     }
+     if (prc == null || prc.size() == 0) {
+         return;
+     }
+
+     for (int i = 0; i < prc.size(); i++) {
+         PackagePart commentAuthorsPart = null;
+         try {
+             commentAuthorsPart = mainDocument.getRelatedPart(prc.getRelationship(i));
+         } catch (InvalidFormatException e) {
+             //ignored: the part stays null and this relationship is skipped
+         }
+         if (commentAuthorsPart == null) {
+             continue;
+         }
+         try (InputStream stream = commentAuthorsPart.getInputStream()) {
+             context.getSAXParser().parse(
+                     new CloseShieldInputStream(stream),
+                     new OfflineContentHandler(new XSLFCommentAuthorHandler()));
+
+         } catch (TikaException | SAXException | IOException e) {
+             //do something with this
+         }
+     }
+
+ }
+
+ /**
+ * Emits the content of one slide followed by the parts related to it.
+ * <p>
+ * The slide XML is parsed inside a div with class "slide-content"; afterwards the
+ * slide layout, notes, notes master and comments related to the slide are passed
+ * to handleBasicRelatedParts, in that order.
+ *
+ * @param slidePart the slide part to parse
+ * @param xhtml handler receiving the extracted content
+ * @throws IOException if the slide part's stream cannot be read
+ * @throws SAXException if the handler or the SAX parse fails
+ */
+ private void handleSlidePart(PackagePart slidePart, XHTMLContentHandler xhtml) throws IOException, SAXException {
+ //the same map is handed to the slide, layout, notes and notes-master handlers below
+ Map<String, String> linkedRelationships = loadLinkedRelationships(slidePart, false);
+
+// Map<String, String> hyperlinks = loadHyperlinkRelationships(packagePart);
+ xhtml.startElement("div", "class", "slide-content");
+ try (InputStream stream = slidePart.getInputStream()) {
+ context.getSAXParser().parse(
+ new CloseShieldInputStream(stream),
+ new OfflineContentHandler(new EmbeddedContentHandler(
+ new OOXMLWordAndPowerPointTextHandler(
+ new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships))));
+
+ } catch (TikaException e) {
+ //do something with this
+ //(a TikaException is swallowed here, so the closing div below is still written)
+ }
+
+ xhtml.endElement("div");
+
+
+ //slide layout: filtered through PlaceHolderSkipper; note that the class label
+ //used for the layout is "slide-master-content"
+ handleBasicRelatedParts(XSLFRelation.SLIDE_LAYOUT.getRelation(),
+ "slide-master-content", slidePart,
+ new PlaceHolderSkipper(new OOXMLWordAndPowerPointTextHandler(
+ new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships))
+ );
+
+ handleBasicRelatedParts(XSLFRelation.NOTES.getRelation(),
+ "slide-notes", slidePart,
+ new OOXMLWordAndPowerPointTextHandler(
+ new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships));
+
+ handleBasicRelatedParts(XSLFRelation.NOTES_MASTER.getRelation(),
+ "slide-notes-master", slidePart,
+ new OOXMLWordAndPowerPointTextHandler(
+ new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships));
+
+ //comments: passed with a null class label; XSLFCommentsHandler writes its own
+ //"slide-comment" paragraphs to xhtml
+ handleBasicRelatedParts(XSLFRelation.COMMENTS.getRelation(),
+ null, slidePart,
+ new XSLFCommentsHandler(xhtml));
+
+// handleBasicRelatedParts("");
+ }
+
+ /**
+ * This should handle the comments, master, notes, etc.
+ * <p>
+ * Looks up the parts related to parentPart by the given relationship type. If at
+ * least one relationship is found, a single wrapping div is sent directly to
+ * contentHandler and each related part is SAX-parsed into contentHandler.
+ *
+ * @param contentType relationship type passed to getRelationshipsByType; despite the
+ * parameter name, callers in this class pass getRelation() values
+ * or HANDOUT_MASTER rather than content types
+ * @param xhtmlClassLabel value of the wrapping div's class attribute; the comments
+ * caller passes null
+ * @param parentPart part whose relationships are searched
+ * @param contentHandler handler that receives the wrapping div and the parsed content
+ * @throws SAXException if the handler or the SAX parse fails
+ */
+ private void handleBasicRelatedParts(String contentType, String xhtmlClassLabel,
+ PackagePart parentPart, ContentHandler contentHandler) throws SAXException {
+
+ PackageRelationshipCollection relatedPartPRC = null;
+
+ try {
+ relatedPartPRC = parentPart.getRelationshipsByType(contentType);
+ } catch (InvalidFormatException e) {
+ //swallow
+ }
+ if (relatedPartPRC != null && relatedPartPRC.size() > 0) {
+ AttributesImpl attributes = new AttributesImpl();
+
+ //NOTE(review): xhtmlClassLabel may be null here (comments caller) -- confirm that
+ //every contentHandler passed in tolerates a null attribute value
+ attributes.addAttribute("", "class", "class", "CDATA", xhtmlClassLabel);
+ contentHandler.startElement("", "div", "div", attributes);
+ for (int i = 0; i < relatedPartPRC.size(); i++) {
+ PackageRelationship relatedPartPackageRelationship = relatedPartPRC.getRelationship(i);
+ try {
+ PackagePart relatedPartPart = parentPart.getRelatedPart(relatedPartPackageRelationship);
+ //unlike the other parse calls in this class, the stream is not wrapped in a
+ //CloseShieldInputStream here
+ try (InputStream stream = relatedPartPart.getInputStream()) {
+ context.getSAXParser().parse(stream,
+ new OfflineContentHandler(new EmbeddedContentHandler(contentHandler)));
+
+ } catch (IOException|TikaException e) {
+ //do something with this
+ }
+
+ } catch (InvalidFormatException e) {
+ //swallowed: this related part is skipped and the loop continues
+ }
+ }
+ contentHandler.endElement("", "div", "div");
+ }
+
+ }
+
+ /**
+  * In PowerPoint files, slides have things embedded in them,
+  * and slide drawings which have the images.
+  * <p>
+  * The returned list holds, in order: for every slide the parts gathered by
+  * addSlideParts, then the main document part, then the slide master and
+  * handout master parts related to the main document.
+  */
+ @Override
+ protected List<PackagePart> getMainDocumentParts() {
+     List<PackagePart> collected = new ArrayList<>();
+     //TODO: consider: getPackage().getPartsByName(Pattern.compile("/ppt/embeddings/.*?
+     //TODO: consider: getPackage().getPartsByName(Pattern.compile("/ppt/media/.*?
+     try {
+         PackageRelationshipCollection slideRels =
+                 mainDocument.getRelationshipsByType(XSLFRelation.SLIDE.getRelation());
+         for (int idx = 0; idx < slideRels.size(); idx++) {
+             addSlideParts(mainDocument.getRelatedPart(slideRels.getRelationship(idx)), collected);
+         }
+     } catch (InvalidFormatException e) {
+         //log
+     }
+
+     collected.add(mainDocument);
+
+     String[] masterRelations = new String[]{
+             XSLFRelation.SLIDE_MASTER.getRelation(),
+             HANDOUT_MASTER};
+     for (String masterRelation : masterRelations) {
+         try {
+             PackageRelationshipCollection masterRels = mainDocument.getRelationshipsByType(masterRelation);
+             for (int idx = 0; idx < masterRels.size(); idx++) {
+                 PackagePart masterPart = mainDocument.getRelatedPart(masterRels.getRelationship(idx));
+                 if (masterPart == null) {
+                     continue;
+                 }
+                 collected.add(masterPart);
+             }
+
+         } catch (InvalidFormatException e) {
+             //log
+         }
+     }
+
+     return collected;
+ }
+
+ /**
+  * Appends to {@code parts} the internally-targeted VML drawing, slide layout,
+  * notes master and notes parts related to the slide, followed by the slide itself.
+  *
+  * @param slidePart slide whose related parts are collected
+  * @param parts list that the parts are appended to
+  */
+ private void addSlideParts(PackagePart slidePart, List<PackagePart> parts) {
+
+     String[] slideRelations = new String[]{
+             XSLFRelation.VML_DRAWING.getRelation(),
+             XSLFRelation.SLIDE_LAYOUT.getRelation(),
+             XSLFRelation.NOTES_MASTER.getRelation(),
+             XSLFRelation.NOTES.getRelation()
+     };
+     for (String slideRelation : slideRelations) {
+         try {
+             for (PackageRelationship rel : slidePart.getRelationshipsByType(slideRelation)) {
+                 //only targets inside the package can be resolved to a part
+                 if (rel.getTargetMode() != TargetMode.INTERNAL) {
+                     continue;
+                 }
+                 PackagePartName targetName = PackagingURIHelper.createPartName(rel.getTargetURI());
+                 parts.add(rel.getPackage().getPart(targetName));
+             }
+         } catch (InvalidFormatException e) {
+
+         }
+     }
+     //and slide of course
+     parts.add(slidePart);
+
+ }
+
+ /**
+  * SAX handler for a slide comments part. For every "cm" element it writes one
+  * paragraph with class "slide-comment" to the wrapped XHTML handler: the author's
+  * name and/or initials in bold (when known to commentAuthors) followed by all
+  * character data collected since the previous comment was written.
+  */
+ private class XSLFCommentsHandler extends DefaultHandler {
+
+     private String commentAuthorId = null;
+     private StringBuilder commentBuffer = new StringBuilder();
+     private XHTMLContentHandler xhtml;
+
+     XSLFCommentsHandler(XHTMLContentHandler xhtml) {
+         this.xhtml = xhtml;
+     }
+
+     @Override
+     public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+         if (!"cm".equals(localName)) {
+             return;
+         }
+         //get date (dt)?
+         commentAuthorId = atts.getValue("", "authorId");
+     }
+
+     @Override
+     public void characters(char[] ch, int start, int length) throws SAXException {
+         //TODO: require that we're in <p:text>?
+         commentBuffer.append(ch, start, length);
+     }
+
+     @Override
+     public void endElement(String uri, String localName, String qName) throws SAXException {
+         if (!"cm".equals(localName)) {
+             return;
+         }
+
+         xhtml.startElement("p", "class", "slide-comment");
+
+         String fullName = commentAuthors.getName(commentAuthorId);
+         String shortName = commentAuthors.getInitials(commentAuthorId);
+         boolean hasName = fullName != null;
+         boolean hasInitials = shortName != null;
+         if (hasName || hasInitials) {
+             //initials are parenthesised only when they follow a name
+             boolean parenthesise = hasName && hasInitials;
+             xhtml.startElement("b");
+             if (hasName) {
+                 xhtml.characters(fullName);
+             }
+             if (parenthesise) {
+                 xhtml.characters(" (");
+             }
+             if (hasInitials) {
+                 xhtml.characters(shortName);
+             }
+             if (parenthesise) {
+                 xhtml.characters(")");
+             }
+             xhtml.endElement("b");
+         }
+         xhtml.characters(commentBuffer.toString());
+         xhtml.endElement("p");
+
+         //reset state for the next comment
+         commentBuffer.setLength(0);
+         commentAuthorId = null;
+     }
+ }
+
+ private class XSLFCommentAuthorHandler extends DefaultHandler {
+ String id = null;
+ String name = null;
+ String initials = null;
+ @Override
+ public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+ if ("cmAuthor".equals(localName)) {
+ for (int i = 0; i < atts.getLength(); i++) {
+ if ("id".equals(atts.getLocalName(i))) {
+ id = atts.getValue(i);
+ } else if ("name".equals(atts.getLocalName(i))) {
+ name = atts.getValue(i);
+ } else if ("initials".equals(atts.getLocalName(i))) {
+ initials = atts.getValue(i);
+ }
+ }
+ commentAuthors.add(id, name, initials);
+ //clear out
+ id = null; name = null; initials = null;
+ }
+ }
+
+ }
+
+
+ /**
+ * Filter that suppresses SAX events belonging to placeholder shapes.
+ * <p>
+ * Events are forwarded to the wrapped handler until a start element with local
+ * name "ph" is seen; from that element on (inclusive) nothing is forwarded until
+ * after the end of an element with local name "sp".
+ * <p>
+ * NOTE(review): start events forwarded before the "ph" was seen (for example the
+ * enclosing "sp") get no matching end events -- presumably the wrapped handler
+ * tolerates this; confirm.
+ */
+ private static class PlaceHolderSkipper extends DefaultHandler {
+
+ private final ContentHandler wrappedHandler;
+
+ PlaceHolderSkipper(ContentHandler wrappedHandler) {
+ this.wrappedHandler = wrappedHandler;
+ }
+
+ //true from a "ph" start element until the end of the next "sp" element
+ boolean inPH = false;
+ @Override
+ public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+ //the flag is set before the check, so the "ph" element itself is not forwarded
+ if ("ph".equals(localName)) {
+ inPH = true;
+ }
+ if (! inPH) {
+ wrappedHandler.startElement(uri, localName, qName, atts);
+ }
+ }
+
+ @Override
+ public void endElement(String uri, String localName, String qName) throws SAXException {
+
+ //forward first, reset afterwards: the closing "sp" of a skipped shape is dropped too
+ if (! inPH) {
+ wrappedHandler.endElement(uri, localName, qName);
+ }
+ if ("sp".equals(localName)) {
+ inPH = false;
+ }
+ }
+ @Override
+ public void characters(char[] ch, int start, int length) throws SAXException {
+ if (! inPH) {
+ wrappedHandler.characters(ch, start, length);
+ }
+ }
+
+ @Override
+ public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+ //NOTE(review): forwarded as characters(), not ignorableWhitespace() -- confirm intentional
+ if (! inPH) {
+ wrappedHandler.characters(ch, start, length);
+ }
+ }
+
+
+ }
+
+ /**
+  * Lookup of comment author names and initials, both keyed by author id.
+  * Null ids are never stored and always look up to null.
+  */
+ private class CommentAuthors {
+     Map<String, String> nameMap = new HashMap<>();
+     Map<String, String> initialMap = new HashMap<>();
+
+     /** Records whichever of name/initials is non-null for a non-null id. */
+     void add(String id, String name, String initials) {
+         if (id != null) {
+             if (name != null) {
+                 nameMap.put(id, name);
+             }
+             if (initials != null) {
+                 initialMap.put(id, initials);
+             }
+         }
+     }
+
+     /** Returns the name recorded for the id, or null if the id is null or unknown. */
+     String getName(String id) {
+         return (id == null) ? null : nameMap.get(id);
+     }
+
+     /** Returns the initials recorded for the id, or null if the id is null or unknown. */
+     String getInitials(String id) {
+         return (id == null) ? null : initialMap.get(id);
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/68161573/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index ce33c08..8f9fbf5 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -18,7 +18,7 @@ package org.apache.tika.parser.microsoft.ooxml;
import java.io.IOException;
import java.io.InputStream;
-import java.util.HashMap;
+import java.util.ArrayList;
import java.util.List;
import java.util.Map;
@@ -34,11 +34,9 @@ import org.apache.poi.xwpf.usermodel.XWPFRelation;
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
-import org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor;
-import org.apache.tika.parser.microsoft.ooxml.XWPFListManager;
-import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFDocumentXMLBodyHandler;
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
-import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFTikaBodyPartHandler;
+import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFNumberingShim;
+import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFStylesShim;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
@@ -59,9 +57,28 @@ import org.xml.sax.SAXException;
public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
+ //include all parts that might have embedded objects
+ private final static String[] MAIN_PART_RELATIONS = new String[]{
+ XWPFRelation.HEADER.getRelation(),
+ XWPFRelation.FOOTER.getRelation(),
+ XWPFRelation.FOOTNOTE.getRelation(),
+ "http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes",
+ "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments"
+ };
+
+ //a docx file should have one of these "main story" parts
+ private final static String[] MAIN_STORY_PART_RELATIONS = new String[]{
+ XWPFRelation.DOCUMENT.getContentType(),
+ XWPFRelation.MACRO_DOCUMENT.getContentType(),
+ XWPFRelation.TEMPLATE.getContentType(),
+ XWPFRelation.MACRO_TEMPLATE_DOCUMENT.getContentType()
+
+ };
+
private final OPCPackage opcPackage;
private final ParseContext context;
+
public SXWPFWordExtractorDecorator(ParseContext context,
XWPFEventBasedWordExtractor extractor) {
super(context, extractor);
@@ -74,7 +91,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
protected void buildXHTML(XHTMLContentHandler xhtml)
throws SAXException, XmlException, IOException {
//handle main document
- List<PackagePart> pps = opcPackage.getPartsByContentType(XWPFRelation.DOCUMENT.getContentType());
+ List<PackagePart> pps = getStoryDocumentParts();
if (pps != null) {
for (PackagePart pp : pps) {
//likely only one, but why not...
@@ -83,11 +100,15 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
}
//handle glossary document
pps = opcPackage.getPartsByContentType(XWPFRelation.GLOSSARY_DOCUMENT.getContentType());
-
if (pps != null) {
- for (PackagePart pp : pps) {
- //likely only one, but why not...
- handleDocumentPart(pp, xhtml);
+ if (pps.size() > 0) {
+ xhtml.startElement("div", "class", "glossary");
+
+ for (PackagePart pp : pps) {
+ //likely only one, but why not...
+ handleDocumentPart(pp, xhtml);
+ }
+ xhtml.endElement("div");
}
}
}
@@ -95,8 +116,8 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
private void handleDocumentPart(PackagePart documentPart, XHTMLContentHandler xhtml) throws IOException, SAXException {
//load the numbering/list manager and styles from the main document part
XWPFNumbering numbering = loadNumbering(documentPart);
- XWPFListManager xwpfListManager = new XWPFListManager(numbering);
- //TODO: XWPFStyles styles = loadStyles(documentPart);
+ XWPFListManager listManager = new XWPFListManager(numbering);
+ XWPFStylesShim styles = loadStyles(documentPart);
//headers
try {
@@ -104,7 +125,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
if (headersPRC != null) {
for (int i = 0; i < headersPRC.size(); i++) {
PackagePart header = documentPart.getRelatedPart(headersPRC.getRelationship(i));
- handlePart(header, xwpfListManager, xhtml);
+ handlePart(header, styles, listManager, xhtml);
}
}
} catch (InvalidFormatException e) {
@@ -112,7 +133,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
}
//main document
- handlePart(documentPart, xwpfListManager, xhtml);
+ handlePart(documentPart, styles, listManager, xhtml);
//for now, just dump other components at end
for (XWPFRelation rel : new XWPFRelation[]{
@@ -126,7 +147,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
if (prc != null) {
for (int i = 0; i < prc.size(); i++) {
PackagePart packagePart = documentPart.getRelatedPart(prc.getRelationship(i));
- handlePart(packagePart, xwpfListManager, xhtml);
+ handlePart(packagePart, styles, listManager, xhtml);
}
}
} catch (InvalidFormatException e) {
@@ -135,44 +156,26 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
}
}
- private void handlePart(PackagePart packagePart,
- XWPFListManager xwpfListManager, XHTMLContentHandler xhtml) throws IOException, SAXException {
+ private void handlePart(PackagePart packagePart, XWPFStylesShim styles,
+ XWPFListManager listManager, XHTMLContentHandler xhtml) throws IOException, SAXException {
- Map<String, String> hyperlinks = loadHyperlinkRelationships(packagePart);
+ Map<String, String> linkedRelationships = loadLinkedRelationships(packagePart, true);
try (InputStream stream = packagePart.getInputStream()) {
context.getSAXParser().parse(
new CloseShieldInputStream(stream),
new OfflineContentHandler(new EmbeddedContentHandler(
- new XWPFDocumentXMLBodyHandler(
- new XWPFTikaBodyPartHandler(xhtml, xwpfListManager,
- context.get(OfficeParserConfig.class)), hyperlinks))));
+ new OOXMLWordAndPowerPointTextHandler(
+ new OOXMLTikaBodyPartHandler(xhtml, styles, listManager,
+ context.get(OfficeParserConfig.class)), linkedRelationships))));
} catch (TikaException e) {
- e.printStackTrace();
+ //swallow
}
}
- private Map<String, String> loadHyperlinkRelationships(PackagePart bodyPart) {
- Map<String, String> hyperlinks = new HashMap<>();
- try {
- PackageRelationshipCollection prc = bodyPart.getRelationshipsByType(XWPFRelation.HYPERLINK.getRelation());
- for (int i = 0; i < prc.size(); i++) {
- PackageRelationship pr = prc.getRelationship(i);
- if (pr == null) {
- continue;
- }
- String id = pr.getId();
- String url = (pr.getTargetURI() == null) ? null : pr.getTargetURI().toString();
- if (id != null && url != null) {
- hyperlinks.put(id, url);
- }
- }
- } catch (InvalidFormatException e) {
- }
- return hyperlinks;
- }
-/*
- private XWPFStyles loadStyles(PackagePart packagePart) {
+
+
+ private XWPFStylesShim loadStyles(PackagePart packagePart) {
try {
PackageRelationshipCollection stylesParts =
packagePart.getRelationshipsByType(XWPFRelation.STYLES.getRelation());
@@ -181,19 +184,20 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
if (stylesRelationShip == null) {
return null;
}
- PackagePart stylesPart = opcPackage.getPart(stylesRelationShip);
+ PackagePart stylesPart = packagePart.getRelatedPart(stylesRelationShip);
if (stylesPart == null) {
return null;
}
- return new XWPFStyles(stylesPart);
+
+ return new XWPFStylesShim(stylesPart, context);
}
- } catch (IOException|OpenXML4JException e) {
+ } catch (OpenXML4JException e) {
//swallow
}
return null;
}
-*/
+
private XWPFNumbering loadNumbering(PackagePart packagePart) {
try {
PackageRelationshipCollection numberingParts = packagePart.getRelationshipsByType(XWPFRelation.NUMBERING.getRelation());
@@ -202,11 +206,11 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
if (numberingRelationShip == null) {
return null;
}
- PackagePart numberingPart = opcPackage.getPart(numberingRelationShip);
+ PackagePart numberingPart = packagePart.getRelatedPart(numberingRelationShip);
if (numberingPart == null) {
return null;
}
- return new XWPFNumbering(numberingPart);
+ return new XWPFNumberingShim(numberingPart);
}
} catch (IOException | OpenXML4JException e) {
//swallow
@@ -215,10 +219,57 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
}
/**
- * This returns the main document only.
+ * This returns all items that might contain embedded objects:
+ * main document, headers, footers, comments, etc.
*/
@Override
protected List<PackagePart> getMainDocumentParts() {
- return opcPackage.getPartsByContentType(XWPFRelation.DOCUMENT.getContentType());
+
+ List<PackagePart> mainStoryDocs = getStoryDocumentParts();
+ List<PackagePart> relatedParts = new ArrayList<>();
+
+ mainStoryDocs.addAll(
+ opcPackage.getPartsByContentType(
+ XWPFRelation.GLOSSARY_DOCUMENT.getContentType()));
+
+
+ for (PackagePart pp : mainStoryDocs) {
+ addRelatedParts(pp, relatedParts);
+ }
+ relatedParts.addAll(mainStoryDocs);
+ return relatedParts;
+ }
+
+ private void addRelatedParts(PackagePart documentPart, List<PackagePart> relatedParts) {
+ for (String relation : MAIN_PART_RELATIONS) {
+ PackageRelationshipCollection prc = null;
+ try {
+ prc = documentPart.getRelationshipsByType(relation);
+ if (prc != null) {
+ for (int i = 0; i < prc.size(); i++) {
+ PackagePart packagePart = documentPart.getRelatedPart(prc.getRelationship(i));
+ relatedParts.add(packagePart);
+ }
+ }
+ } catch (InvalidFormatException e) {
+ }
+ }
+
+ }
+
+ /**
+ *
+ * @return the first non-empty main story document part; empty list if no
+ * main story is found.
+ */
+ private List<PackagePart> getStoryDocumentParts() {
+
+ for (String contentType : MAIN_STORY_PART_RELATIONS) {
+ List<PackagePart> pps = opcPackage.getPartsByContentType(contentType);
+ if (pps.size() > 0) {
+ return pps;
+ }
+ }
+ return new ArrayList<>();
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/68161573/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
index 394c903..57653cb 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
@@ -31,7 +31,27 @@ import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.apache.poi.openxml4j.opc.TargetMode;
import org.apache.poi.sl.usermodel.Placeholder;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
-import org.apache.poi.xslf.usermodel.*;
+import org.apache.poi.xslf.usermodel.XMLSlideShow;
+import org.apache.poi.xslf.usermodel.XSLFCommentAuthors;
+import org.apache.poi.xslf.usermodel.XSLFComments;
+import org.apache.poi.xslf.usermodel.XSLFGraphicFrame;
+import org.apache.poi.xslf.usermodel.XSLFGroupShape;
+import org.apache.poi.xslf.usermodel.XSLFHyperlink;
+import org.apache.poi.xslf.usermodel.XSLFNotes;
+import org.apache.poi.xslf.usermodel.XSLFNotesMaster;
+import org.apache.poi.xslf.usermodel.XSLFPictureShape;
+import org.apache.poi.xslf.usermodel.XSLFRelation;
+import org.apache.poi.xslf.usermodel.XSLFShape;
+import org.apache.poi.xslf.usermodel.XSLFSheet;
+import org.apache.poi.xslf.usermodel.XSLFSlide;
+import org.apache.poi.xslf.usermodel.XSLFSlideLayout;
+import org.apache.poi.xslf.usermodel.XSLFSlideShow;
+import org.apache.poi.xslf.usermodel.XSLFTable;
+import org.apache.poi.xslf.usermodel.XSLFTableCell;
+import org.apache.poi.xslf.usermodel.XSLFTableRow;
+import org.apache.poi.xslf.usermodel.XSLFTextParagraph;
+import org.apache.poi.xslf.usermodel.XSLFTextRun;
+import org.apache.poi.xslf.usermodel.XSLFTextShape;
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
@@ -55,7 +75,7 @@ public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
}
/**
- * @see org.apache.poi.xslf.extractor.XSLFPowerPointExtractor#getText()
+ * @see XSLFPowerPointExtractor#getText()
*/
protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException {
XMLSlideShow slideShow = (XMLSlideShow) extractor.getDocument();
@@ -148,6 +168,7 @@ public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
boolean inHyperlink = false;
for (XSLFTextParagraph p : txt.getTextParagraphs()) {
xhtml.startElement("p");
+
for (XSLFTextRun run : p.getTextRuns()) {
//TODO: add check for targetmode=external into POI
//then check to confirm that the urls are actually
@@ -219,7 +240,6 @@ public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
xhtml.startElement("table");
for (XSLFTableRow row : tbl) {
xhtml.startElement("tr");
- List<XSLFTableCell> cells = row.getCells();
for (XSLFTableCell c : row.getCells()) {
xhtml.startElement("td");
//TODO: Need to wait for fix in POI to test for hyperlink first
http://git-wip-us.apache.org/repos/asf/tika/blob/68161573/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index 912469f..0f6957c 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -74,7 +74,6 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
private final DataFormatter formatter;
private final List<PackagePart> sheetParts = new ArrayList<PackagePart>();
private final Map<String, String> drawingHyperlinks = new HashMap<>();
-
private Metadata metadata;
private ParseContext parseContext;
@@ -161,6 +160,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
extractHeaderFooter(footer, xhtml);
}
processShapes(iter.getShapes(), xhtml);
+
//for now dump sheet hyperlinks at bottom of page
//consider a double-pass of the inputstream to reunite hyperlinks with cells/textboxes
//step 1: extract hyperlink info from bottom of page
@@ -171,6 +171,39 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
}
}
+ private void addDrawingHyperLinks(PackagePart sheetPart) {
+ try {
+ for (PackageRelationship rel : sheetPart.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation())) {
+ if (rel.getTargetMode() == TargetMode.INTERNAL) {
+ PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
+ for (PackageRelationship drawRel : rel.getPackage()
+ .getPart(relName)
+ .getRelationshipsByType(XSSFRelation.SHEET_HYPERLINKS.getRelation())) {
+ drawingHyperlinks.put(drawRel.getId(), drawRel.getTargetURI().toString());
+ }
+ }
+ }
+ } catch (InvalidFormatException e) {
+ //swallow
+ //an exception trying to extract
+ //hyperlinks on drawings should not cause a parse failure
+ }
+
+ }
+
+
+ private void extractHyperLinks(PackagePart sheetPart, XHTMLContentHandler xhtml) throws SAXException {
+ try {
+ for (PackageRelationship rel : sheetPart.getRelationshipsByType(XSSFRelation.SHEET_HYPERLINKS.getRelation())) {
+ xhtml.startElement("a", "href", rel.getTargetURI().toString());
+ xhtml.characters(rel.getTargetURI().toString());
+ xhtml.endElement("a");
+ }
+ } catch (InvalidFormatException e) {
+ //swallow
+ }
+ }
+
private void extractHeaderFooter(String hf, XHTMLContentHandler xhtml)
throws SAXException {
String content = ExcelExtractor._extractHeaderFooter(
@@ -232,39 +265,6 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
}
- private void addDrawingHyperLinks(PackagePart sheetPart) {
- try {
- for (PackageRelationship rel : sheetPart.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation())) {
- if (rel.getTargetMode() == TargetMode.INTERNAL) {
- PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
- for (PackageRelationship drawRel : rel.getPackage()
- .getPart(relName)
- .getRelationshipsByType(XSSFRelation.SHEET_HYPERLINKS.getRelation())) {
- drawingHyperlinks.put(drawRel.getId(), drawRel.getTargetURI().toString());
- }
- }
- }
- } catch (InvalidFormatException e) {
- //swallow
- //an exception trying to extract
- //hyperlinks on drawings should not cause a parse failure
- }
-
- }
-
-
- private void extractHyperLinks(PackagePart sheetPart, XHTMLContentHandler xhtml) throws SAXException {
- try {
- for (PackageRelationship rel : sheetPart.getRelationshipsByType(XSSFRelation.SHEET_HYPERLINKS.getRelation())) {
- xhtml.startElement("a", "href", rel.getTargetURI().toString());
- xhtml.characters(rel.getTargetURI().toString());
- xhtml.endElement("a");
- }
- } catch (InvalidFormatException e) {
- //swallow
- }
- }
-
public void processSheet(
SheetContentsHandler sheetContentsExtractor,
CommentsTable comments,
http://git-wip-us.apache.org/repos/asf/tika/blob/68161573/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
index d51f2e9..c8bcdc7 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
@@ -31,6 +31,12 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNumLvl;
public class XWPFListManager extends AbstractListManager {
+
+ /**
+ * Empty singleton to be used when there is no list manager.
+ * Always returns empty string.
+ */
+ public final static XWPFListManager EMPTY_LIST = new EmptyListManager();
private final static boolean OVERRIDE_AVAILABLE;
private final static String SKIP_FORMAT = Character.toString((char) 61623);//if this shows up as the lvlText, don't show a number
@@ -66,6 +72,7 @@ public class XWPFListManager extends AbstractListManager {
if (numbering == null || iLvl < 0 || numId == null) {
return "";
}
+
int currNumId = numId.intValue();
XWPFNum xwpfNum = numbering.getNum(numId);
@@ -174,4 +181,21 @@ public class XWPFListManager extends AbstractListManager {
return new LevelTuple(start, restart, lvlText, numFmt, isLegal);
}
+
+ //Backs EMPTY_LIST: both lookups ignore their arguments and answer "".
+ private static class EmptyListManager extends XWPFListManager {
+ EmptyListManager() {
+ super(null);
+ }
+
+ @Override
+ public String getFormattedNumber(XWPFParagraph paragraph) {
+ return "";
+ }
+
+ @Override
+ public String getFormattedNumber(BigInteger numId, int iLvl) {
+ return "";
+ }
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/68161573/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
new file mode 100644
index 0000000..3e98203
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
@@ -0,0 +1,212 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xslf;
+
+import java.io.IOException;
+import java.util.Date;
+
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.POIXMLProperties;
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler;
+import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
+import org.apache.tika.parser.microsoft.ooxml.RunProperties;
+import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
+import org.apache.xmlbeans.XmlException;
+
+public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor {
+ //Work in progress: getText() is still a stub that returns "", and nothing in this
+ //class instantiates XSLFToTextContentHandler yet.
+ private final OPCPackage container;
+ private final POIXMLProperties properties;
+
+ public XSLFEventBasedPowerPointExtractor(String path) throws XmlException, OpenXML4JException, IOException {
+ this(OPCPackage.open(path));
+ }
+
+ public XSLFEventBasedPowerPointExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
+ super((POIXMLDocument) null);
+ this.container = container;
+ this.properties = new POIXMLProperties(container);
+ }
+
+
+ public static void main(String[] args) throws Exception {
+ if (args.length < 1) {
+ System.err.println("Use:");
+ System.err.println(" XSLFEventBasedPowerPointExtractor <filename.pptx>");
+ System.exit(1);
+ }
+ //run the extractor defined by this class (not the Word one) and close it even if getText() throws
+ try (XSLFEventBasedPowerPointExtractor extractor = new XSLFEventBasedPowerPointExtractor(args[0])) {
+ System.out.println(extractor.getText());
+ }
+ }
+
+ public OPCPackage getPackage() {
+ return this.container;
+ }
+
+ public POIXMLProperties.CoreProperties getCoreProperties() {
+ return this.properties.getCoreProperties();
+ }
+
+ public POIXMLProperties.ExtendedProperties getExtendedProperties() {
+ return this.properties.getExtendedProperties();
+ }
+
+ public POIXMLProperties.CustomProperties getCustomProperties() {
+ return this.properties.getCustomProperties();
+ }
+
+
+ @Override
+ public String getText() {
+ //TODO
+ return "";
+ }
+
+ //Plain-text sink: appends run text to the buffer, "\n" after each paragraph and table row,
+ //"\t" after each table cell; all other callbacks do nothing and both getInclude* flags are false.
+ private class XSLFToTextContentHandler implements OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler {
+ private final StringBuilder buffer;
+
+ public XSLFToTextContentHandler(StringBuilder buffer) {
+ this.buffer = buffer;
+ }
+
+ @Override
+ public void run(RunProperties runProperties, String contents) {
+ buffer.append(contents);
+ }
+
+ @Override
+ public void hyperlinkStart(String link) {
+ //no-op
+ }
+
+ @Override
+ public void hyperlinkEnd() {
+ //no-op
+ }
+
+ @Override
+ public void startParagraph(ParagraphProperties paragraphProperties) {
+ //no-op
+ }
+
+ @Override
+ public void endParagraph() {
+ buffer.append("\n");
+ }
+
+ @Override
+ public void startTable() {
+ //no-op
+ }
+
+ @Override
+ public void endTable() {
+ //no-op
+ }
+
+ @Override
+ public void startTableRow() {
+ //no-op
+ }
+
+ @Override
+ public void endTableRow() {
+ buffer.append("\n");
+ }
+
+ @Override
+ public void startTableCell() {
+ //no-op
+ }
+
+ @Override
+ public void endTableCell() {
+ buffer.append("\t");
+ }
+
+ @Override
+ public void startSDT() {
+ //no-op
+ }
+
+ @Override
+ public void endSDT() {
+ //no-op
+ }
+
+ @Override
+ public void startEditedSection(String editor, Date date, OOXMLWordAndPowerPointTextHandler.EditType editType) {
+ //no-op
+ }
+
+ @Override
+ public void endEditedSection() {
+ //no-op
+ }
+
+ @Override
+ public boolean getIncludeDeletedText() {
+ return false;
+ }
+
+ @Override
+ public void footnoteReference(String id) {
+ //no-op
+ }
+
+ @Override
+ public void endnoteReference(String id) {
+ //no-op
+ }
+
+ @Override
+ public boolean getIncludeMoveFromText() {
+ return false;
+ }
+
+
+ @Override
+ public void embeddedOLERef(String refId) {
+ //no-op
+ }
+
+ @Override
+ public void embeddedPicRef(String picFileName, String picDescription) {
+ //no-op
+ }
+
+ @Override
+ public void startBookmark(String id, String name) {
+ //no-op
+ }
+
+ @Override
+ public void endBookmark(String id) {
+ //no-op
+ }
+
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/68161573/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
index 06ef951..86fb8f2 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
@@ -38,6 +38,9 @@ import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.util.SAXHelper;
import org.apache.poi.xwpf.usermodel.XWPFNumbering;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler;
+import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
+import org.apache.tika.parser.microsoft.ooxml.RunProperties;
import org.apache.tika.parser.microsoft.ooxml.XWPFListManager;
import org.apache.xmlbeans.XmlException;
import org.xml.sax.InputSource;
@@ -45,6 +48,7 @@ import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
//TODO: move this into POI?
+
/**
* Experimental class that is based on POI's XSSFEventBasedExcelExtractor
*
@@ -180,7 +184,7 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
Map<String, String> hyperlinks = loadHyperlinkRelationships(packagePart);
try (InputStream stream = packagePart.getInputStream()) {
XMLReader reader = SAXHelper.newXMLReader();
- reader.setContentHandler(new XWPFDocumentXMLBodyHandler(
+ reader.setContentHandler(new OOXMLWordAndPowerPointTextHandler(
new XWPFToTextContentHandler(buffer), hyperlinks));
reader.parse(new InputSource(new CloseShieldInputStream(stream)));
@@ -209,29 +213,7 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
}
return hyperlinks;
}
-/*
- private XWPFStyles loadStyles(PackagePart packagePart) {
- try {
- PackageRelationshipCollection stylesParts =
- packagePart.getRelationshipsByType(XWPFRelation.STYLES.getRelation());
- if (stylesParts.size() > 0) {
- PackageRelationship stylesRelationShip = stylesParts.getRelationship(0);
- if (stylesRelationShip == null) {
- return null;
- }
- PackagePart stylesPart = opcPackage.getPart(stylesRelationShip);
- if (stylesPart == null) {
- return null;
- }
- return new XWPFStyles(stylesPart);
- }
- } catch (IOException|OpenXML4JException e) {
- //swallow
- }
- return null;
- }
-*/
private XWPFNumbering loadNumbering(PackagePart packagePart) {
try {
PackageRelationshipCollection numberingParts = packagePart.getRelationshipsByType(XWPFRelation.NUMBERING.getRelation());
@@ -252,7 +234,7 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
return null;
}
- private class XWPFToTextContentHandler implements XWPFDocumentXMLBodyHandler.XWPFBodyContentsHandler {
+ private class XWPFToTextContentHandler implements OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler {
private final StringBuilder buffer;
public XWPFToTextContentHandler(StringBuilder buffer) {
@@ -260,17 +242,22 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
}
@Override
- public void run(XWPFRunProperties runProperties, String contents) {
+ public void run(RunProperties runProperties, String contents) {
buffer.append(contents);
}
@Override
- public void hyperlinkRun(String link, String text) {
- buffer.append(" (").append(text).append(") ");
+ public void hyperlinkStart(String link) {
+ //no-op
+ }
+
+ @Override
+ public void hyperlinkEnd() {
+ //no-op
}
@Override
- public void startParagraph() {
+ public void startParagraph(ParagraphProperties paragraphProperties) {
//no-op
}
@@ -320,7 +307,7 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
}
@Override
- public void startEditedSection(String editor, Date date, XWPFDocumentXMLBodyHandler.EditType editType) {
+ public void startEditedSection(String editor, Date date, OOXMLWordAndPowerPointTextHandler.EditType editType) {
}
@@ -348,6 +335,26 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
public boolean getIncludeMoveFromText() {
return false;
}
+
+ @Override
+ public void embeddedOLERef(String refId) {
+ //no-op
+ }
+
+ @Override
+ public void embeddedPicRef(String picFileName, String picDescription) {
+ //no-op
+ }
+
+ @Override
+ public void startBookmark(String id, String name) {
+ //no-op
+ }
+
+ @Override
+ public void endBookmark(String id) {
+ //no-op
+ }
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/68161573/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFNumberingShim.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFNumberingShim.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFNumberingShim.java
new file mode 100644
index 0000000..2a07e3e
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFNumberingShim.java
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf;
+
+import java.io.IOException;
+
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.xwpf.usermodel.XWPFNumbering;
+
+/**
+ * Stub class of POI's XWPFNumbering because onDocumentRead() is protected
+ */
+public class XWPFNumberingShim extends XWPFNumbering {
+ //The constructor itself invokes the inherited onDocumentRead(), which outside callers cannot reach.
+ public XWPFNumberingShim(PackagePart part) throws IOException, OpenXML4JException {
+ super(part);
+ onDocumentRead();
+ }
+
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/68161573/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFStylesShim.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFStylesShim.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFStylesShim.java
new file mode 100644
index 0000000..471c235
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFStylesShim.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * For Tika, all we need (so far) is a mapping between styleId and a style's name.
+ *
+ * This class uses SAX to scrape that info out of the styles.xml file. If
+ * either the styleId or the style's name is null, no information is recorded.
+ */
+public class XWPFStylesShim {
+
+ /**
+ * Empty singleton to be used when there is no style info
+ */
+ public static final XWPFStylesShim EMPTY_STYLES = new EmptyXWPFStyles();
+
+ private final Map<String, String> styles = new HashMap<>();
+
+ private XWPFStylesShim() {
+ //leaves the style map empty; reached only through EmptyXWPFStyles
+ }
+
+ public XWPFStylesShim(PackagePart part, ParseContext parseContext) {
+ try (InputStream is = part.getInputStream()) {
+ onDocumentLoad(parseContext, is);
+ } catch (IOException | TikaException | SAXException e) {
+ //swallow: the map simply keeps whatever was recorded before the failure
+ }
+ }
+
+ private void onDocumentLoad(ParseContext parseContext, InputStream stream) throws TikaException, IOException, SAXException {
+ parseContext.getSAXParser().parse(stream,
+ new OfflineContentHandler(new StylesStripper()));
+ }
+
+ /**
+ * Looks up the name that was recorded for a style id.
+ * @param styleId id of the style; may be null
+ * @return style's name or null if styleId is null or can't be found
+ */
+ public String getStyleName(String styleId) {
+ if (styleId == null) {
+ return null;
+ }
+ return styles.get(styleId);
+ }
+
+ private static class EmptyXWPFStyles extends XWPFStylesShim {
+
+ @Override
+ public String getStyleName(String styleId) {
+ return null;
+ }
+ }
+
+ private class StylesStripper extends DefaultHandler {
+ //styleId of the style element currently open; null when outside one or when it had no styleId
+ String currentStyleId = null;
+
+ @Override
+ public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+ if (uri == null || OOXMLWordAndPowerPointTextHandler.W_NS.equals(uri)) {
+ if ("style".equals(localName)) {
+ currentStyleId = atts.getValue(OOXMLWordAndPowerPointTextHandler.W_NS, "styleId");
+ } else if ("name".equals(localName)) {
+ String name = atts.getValue(OOXMLWordAndPowerPointTextHandler.W_NS, "val");
+ if (currentStyleId != null && name != null) {
+ styles.put(currentStyleId, name);
+ }
+ }
+ }
+ }
+
+ @Override
+ public void endElement(String uri, String localName, String qName) throws SAXException {
+ if (uri == null || OOXMLWordAndPowerPointTextHandler.W_NS.equals(uri)) {
+ if ("style".equals(localName)) {
+ currentStyleId = null;
+ }
+ }
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/68161573/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLDocHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLDocHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLDocHandler.java
index 4276671..5b4853f 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLDocHandler.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLDocHandler.java
@@ -54,36 +54,36 @@ class Word2006MLDocHandler extends DefaultHandler {
addPartHandler(new RelationshipsHandler(relationshipsManager));
- addPartHandler(new BodyPartHandler(
+ addPartHandler(new WordAndPowerPointTextPartHandler(
XWPFRelation.DOCUMENT.getContentType(),
xhtml, relationshipsManager, officeParserConfig));
- addPartHandler(new BodyPartHandler(
+ addPartHandler(new WordAndPowerPointTextPartHandler(
XWPFRelation.FOOTNOTE.getContentType(),
xhtml, relationshipsManager, officeParserConfig));
- addPartHandler(new BodyPartHandler(
+ addPartHandler(new WordAndPowerPointTextPartHandler(
"application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml",
xhtml, relationshipsManager, officeParserConfig));
- addPartHandler(new BodyPartHandler(
+ addPartHandler(new WordAndPowerPointTextPartHandler(
XWPFRelation.HEADER.getContentType(),
xhtml, relationshipsManager, officeParserConfig));
- addPartHandler(new BodyPartHandler(
+ addPartHandler(new WordAndPowerPointTextPartHandler(
XWPFRelation.FOOTER.getContentType(),
xhtml, relationshipsManager, officeParserConfig));
- addPartHandler(new BodyPartHandler(
+ addPartHandler(new WordAndPowerPointTextPartHandler(
"application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml",
xhtml, relationshipsManager, officeParserConfig));
- addPartHandler(new BodyPartHandler(
+ addPartHandler(new WordAndPowerPointTextPartHandler(
"application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml",
xhtml, relationshipsManager, officeParserConfig));
- addPartHandler(new BodyPartHandler(
+ addPartHandler(new WordAndPowerPointTextPartHandler(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document.glossary+xml",
xhtml, relationshipsManager, officeParserConfig));
http://git-wip-us.apache.org/repos/asf/tika/blob/68161573/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/WordAndPowerPointTextPartHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/WordAndPowerPointTextPartHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/WordAndPowerPointTextPartHandler.java
new file mode 100644
index 0000000..ee18ef5
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/WordAndPowerPointTextPartHandler.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006;
+
+import java.util.HashMap;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.microsoft.OfficeParserConfig;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLTikaBodyPartHandler;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+
+/**
+ * Simple wrapper/extension of OOXMLWordAndPowerPointTextHandler to fit
+ * into the inline parsing scheme.
+ */
+class WordAndPowerPointTextPartHandler extends OOXMLWordAndPowerPointTextHandler implements PartHandler {
+ //NOTE(review): relationshipsManager is accepted but never used here, and the map handed to super starts empty -- confirm intended
+ private final String contentType;
+ private String name;
+ public WordAndPowerPointTextPartHandler(String contentType, XHTMLContentHandler xhtml,
+ RelationshipsManager relationshipsManager,
+ OfficeParserConfig officeParserConfig) {
+ super(new OOXMLTikaBodyPartHandler(xhtml, null, null, officeParserConfig),
+ new HashMap<String, String>());
+ this.contentType = contentType;
+ }
+
+ @Override
+ public void setName(String name) {
+ this.name = name;
+ }
+
+ @Override
+ public String getName() {
+ return name;
+ }
+
+ @Override
+ public String getContentType() {
+ return contentType;
+ }
+
+ @Override
+ public void endPart() throws SAXException, TikaException {
+ //no-op
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/68161573/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index 94e23c1..de65555 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -49,7 +49,7 @@ public class ExcelParserTest extends TikaTest {
ParseContext context = new ParseContext();
context.set(Locale.class, Locale.US);
- XMLResult r = getXML("testEXCEL.xls", new OfficeParser(), new Metadata(), context);
+ XMLResult r = getXML("testEXCEL.xls", new OfficeParser(), context);
assertEquals(
"application/vnd.ms-excel",
@@ -81,7 +81,7 @@ public class ExcelParserTest extends TikaTest {
public void testExcelParserFormatting() throws Exception {
ParseContext context = new ParseContext();
context.set(Locale.class, Locale.US);
- XMLResult r = getXML("testEXCEL-formats.xls", new OfficeParser(), new Metadata(), context);
+ XMLResult r = getXML("testEXCEL-formats.xls", new OfficeParser(), context);
assertEquals(
"application/vnd.ms-excel",
@@ -161,7 +161,7 @@ public class ExcelParserTest extends TikaTest {
return "tika";
}
});
- XMLResult r = getXML("testEXCEL_protected_passtika.xls", new OfficeParser(), new Metadata(), context);
+ XMLResult r = getXML("testEXCEL_protected_passtika.xls", new OfficeParser(), context);
assertEquals(
"application/vnd.ms-excel",
@@ -346,7 +346,7 @@ public class ExcelParserTest extends TikaTest {
ParseContext context = new ParseContext();
context.set(Locale.class, Locale.US);
- XMLResult r = getXML("testEXCEL_custom_props.xls", new OfficeParser(), new Metadata(), context);
+ XMLResult r = getXML("testEXCEL_custom_props.xls", new OfficeParser(), context);
Metadata metadata = r.metadata;
assertEquals("application/vnd.ms-excel", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("", metadata.get(TikaCoreProperties.CREATOR));
@@ -366,8 +366,7 @@ public class ExcelParserTest extends TikaTest {
ParseContext context = new ParseContext();
context.set(Locale.class, Locale.UK);
- XMLResult r = getXML("testEXCEL_headers_footers.xls", new OfficeParser(),
- new Metadata(), context);
+ XMLResult r = getXML("testEXCEL_headers_footers.xls", new OfficeParser(), context);
Metadata metadata = r.metadata;
assertEquals(
http://git-wip-us.apache.org/repos/asf/tika/blob/68161573/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index e56462c..eec9f89 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -18,6 +18,7 @@ package org.apache.tika.parser.microsoft.ooxml;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import javax.xml.transform.OutputKeys;
@@ -37,6 +38,7 @@ import java.util.Locale;
import java.util.Map;
import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -428,7 +430,7 @@ public class OOXMLParserTest extends TikaTest {
// Links
assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>"));
// Anchor links
- assertTrue(xml.contains("<a href=\"#OnMainHeading\">The Main Heading Bookmark</a>"));
+ assertContains("<a href=\"#OnMainHeading\">The Main Heading Bookmark</a>", xml);
// Paragraphs with other styles
assertTrue(xml.contains("<p class=\"signature\">This one"));
@@ -467,29 +469,42 @@ public class OOXMLParserTest extends TikaTest {
*/
@Test
public void testWordPicturesInHeader() throws Exception {
- Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
-
- StringWriter sw = new StringWriter();
- SAXTransformerFactory factory = (SAXTransformerFactory)
- SAXTransformerFactory.newInstance();
- TransformerHandler handler = factory.newTransformerHandler();
- handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
- handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
- handler.setResult(new StreamResult(sw));
+ List<Metadata> metadataList = getRecursiveMetadata("headerPic.docx");
+ assertEquals(2, metadataList.size());
+ Metadata m = metadataList.get(0);
+ String mainContent = m.get(RecursiveParserWrapper.TIKA_CONTENT);
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ m.get(Metadata.CONTENT_TYPE));
+ // Check that an <img> element made it into the first entry's content
+ assertTrue(mainContent.contains("<img"));
+ }
- // Try with a document containing various tables and formattings
- try (InputStream input = getTestDocument("headerPic.docx")) {
- parser.parse(input, handler, metadata, context);
- String xml = sw.toString();
- assertEquals(
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
- metadata.get(Metadata.CONTENT_TYPE));
- // Check that custom headings came through
- assertTrue(xml.contains("<img"));
+ @Test
+ @Ignore("need to add links in xhtml")
+ public void testPicturesInVariousPlaces() throws Exception {
+ //test that images are actually extracted from
+ //headers, footers, comments, endnotes, footnotes
+ List<Metadata> metadataList = getRecursiveMetadata("testWORD_embedded_pics.docx");
+
+ //only process embedded resources once
+ assertEquals(3, metadataList.size());
+ String content = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
+ for (int i = 1; i < 4; i++) { //header1..header3 and footer1..footer3
+ assertContains("header"+i+"_pic", content);
+ assertContains("footer"+i+"_pic", content);
}
+ assertContains("body_pic.jpg", content);
+ assertContains("sdt_pic.jpg", content);
+ assertContains("deeply_embedded_pic", content);
+ assertContains("deleted_pic", content);//TODO: don't extract this
+ assertContains("footnotes_pic", content);
+ assertContains("comments_pic", content);
+ assertContains("endnotes_pic", content);
+// assertContains("sdt2_pic.jpg", content);//name of file is not stored in image-sdt
+ //number of "<img src=" occurrences expected in the first entry's content
+ assertContainsCount("<img src=", content, 14);
}
-
/**
* Documents with some sheets are protected, but not all.
* See TIKA-364.
@@ -983,7 +998,7 @@ public class OOXMLParserTest extends TikaTest {
//TIKA-792; with room for future missing bean tests
@Test
public void testWordMissingOOXMLBeans() throws Exception {
- //If a bean is missing, POI prints stack trace to stderr
+ //If a bean is missing, POI prints stack trace to stderr
String[] fileNames = new String[]{
"testWORD_missing_ooxml_bean1.docx",//TIKA-792
};
@@ -1218,22 +1233,6 @@ public class OOXMLParserTest extends TikaTest {
}
@Test
- public void testEmbeddedPDFInPPTX() throws Exception {
- List<Metadata> metadataList = getRecursiveMetadata("testPPT_embeddedPDF.pptx");
- Metadata pdfMetadata1 = metadataList.get(4);
- assertEquals("application/pdf", pdfMetadata1.get(Metadata.CONTENT_TYPE));
- Metadata pdfMetadata2 = metadataList.get(6);
- assertEquals("application/pdf", pdfMetadata2.get(Metadata.CONTENT_TYPE));
- }
-
- @Test
- public void testEmbeddedPDFInXLSX() throws Exception {
- List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_embeddedPDF.xls");
- Metadata pdfMetadata = metadataList.get(2);
- assertEquals("application/pdf", pdfMetadata.get(Metadata.CONTENT_TYPE));
- }
-
- @Test
public void testOrigSourcePath() throws Exception {
Metadata embed1_zip_metadata = getRecursiveMetadata("test_recursive_embedded.docx").get(11);
assertContains("C:\\Users\\tallison\\AppData\\Local\\Temp\\embed1.zip",
@@ -1330,6 +1329,21 @@ public class OOXMLParserTest extends TikaTest {
System.out.println("elapsed: "+(new Date().getTime()-started) + " with " + ex + " exceptions");
}
+    /**
+     * Builds an {@code AutoDetectParser} from the tika-config-sax-docx.xml
+     * classpath resource and checks that the XML produced for
+     * testWORD_2006ml.docx contains "engaging title".
+     * Currently disabled via {@code @Ignore}: "until config is added to 2.x".
+     *
+     * @throws Exception if the config cannot be loaded or parsing fails
+     */
+    @Test
+    @Ignore("until config is added to 2.x")
+    public void testInitializationViaConfig() throws Exception {
+        //NOTE: this test relies on a bug in the DOM extractor that
+        //is passing over the title information.
+        //once we fix that, this test will no longer be meaningful!
+        TikaConfig tikaConfig;
+        //try-with-resources so the config stream is closed even if the
+        //assertion or the TikaConfig constructor throws; a null resource
+        //is tolerated and is reported by assertNotNull
+        try (InputStream is = getClass().getResourceAsStream("/org/apache/tika/parser/microsoft/tika-config-sax-docx.xml")) {
+            assertNotNull(is);
+            tikaConfig = new TikaConfig(is);
+        }
+        AutoDetectParser p = new AutoDetectParser(tikaConfig);
+        XMLResult xml = getXML("testWORD_2006ml.docx", p, new Metadata());
+        assertContains("engaging title", xml.xml);
+    }
+
}