You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2012/12/01 18:57:06 UTC
svn commit: r1416029 - in /tika/trunk: ./
tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/
tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/
tika-parsers/src/test/resources/test-documents/
Author: mikemccand
Date: Sat Dec 1 17:57:05 2012
New Revision: 1416029
URL: http://svn.apache.org/viewvc?rev=1416029&view=rev
Log:
TIKA-1032: dedup relID by slideN_ for embedded files in .pptx
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testPPT_embedded_two_slides.pptx (with props)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1416029&r1=1416028&r2=1416029&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sat Dec 1 17:57:05 2012
@@ -37,7 +37,7 @@ Release 1.3 - Current Development
occurred. The id (rId) is included in the Metadata of each
embedded document as the new Metadata.EMBEDDED_RELATIONSHIP_ID
key, and TikaCLI prepends the rId (if present) onto the filename
- it extracts (TIKA-997).
+ it extracts (TIKA-997, TIKA-1032).
* MS PowerPoint (.ppt): When a PowerPoint (.ppt) document contains
embedded files, Tika now places a <div class="embedded" id="XXX"/> into the
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java?rev=1416029&r1=1416028&r2=1416029&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java Sat Dec 1 17:57:05 2012
@@ -18,6 +18,7 @@ package org.apache.tika.parser.microsoft
import java.io.FileNotFoundException;
import java.io.IOException;
+import java.net.URI;
import java.util.List;
import org.apache.poi.POIXMLDocument;
@@ -108,12 +109,38 @@ public abstract class AbstractOOXMLExtra
xhtml.endDocument();
}
+
+ protected String getJustFileName(String desc) {
+ int idx = desc.lastIndexOf('/');
+ if (idx != -1) {
+ desc = desc.substring(idx+1);
+ }
+ idx = desc.lastIndexOf('.');
+ if (idx != -1) {
+ desc = desc.substring(0, idx);
+ }
+
+ return desc;
+ }
private void handleEmbeddedParts(ContentHandler handler)
throws TikaException, IOException, SAXException {
try {
for (PackagePart source : getMainDocumentParts()) {
for (PackageRelationship rel : source.getRelationships()) {
+
+ URI sourceURI = rel.getSourceURI();
+ String sourceDesc;
+ if (sourceURI != null) {
+ sourceDesc = getJustFileName(sourceURI.getPath());
+ if (sourceDesc.startsWith("slide")) {
+ sourceDesc += "_";
+ } else {
+ sourceDesc = "";
+ }
+ } else {
+ sourceDesc = "";
+ }
if (rel.getTargetMode() == TargetMode.INTERNAL) {
PackagePart target;
@@ -126,12 +153,12 @@ public abstract class AbstractOOXMLExtra
String type = rel.getRelationshipType();
if (RELATION_OLE_OBJECT.equals(type)
&& TYPE_OLE_OBJECT.equals(target.getContentType())) {
- handleEmbeddedOLE(target, handler, rel.getId());
+ handleEmbeddedOLE(target, handler, sourceDesc + rel.getId());
} else if (RELATION_AUDIO.equals(type)
|| RELATION_IMAGE.equals(type)
|| RELATION_PACKAGE.equals(type)
|| RELATION_OLE_OBJECT.equals(type)) {
- handleEmbeddedFile(target, handler, rel.getId());
+ handleEmbeddedFile(target, handler, sourceDesc + rel.getId());
}
}
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java?rev=1416029&r1=1416028&r2=1416029&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java Sat Dec 1 17:57:05 2012
@@ -67,25 +67,33 @@ public class XSLFPowerPointExtractorDeco
XSLFSlide[] slides = slideShow.getSlides();
for (XSLFSlide slide : slides) {
+ String slideDesc;
+ if (slide.getPackagePart() != null && slide.getPackagePart().getPartName() != null) {
+ slideDesc = getJustFileName(slide.getPackagePart().getPartName().toString());
+ slideDesc += "_";
+ } else {
+ slideDesc = null;
+ }
+
// slide
- extractContent(slide.getShapes(), false, xhtml);
+ extractContent(slide.getShapes(), false, xhtml, slideDesc);
// slide layout which is the master sheet for this slide
XSLFSheet slideLayout = slide.getMasterSheet();
- extractContent(slideLayout.getShapes(), true, xhtml);
+ extractContent(slideLayout.getShapes(), true, xhtml, null);
// slide master which is the master sheet for all text layouts
XSLFSheet slideMaster = slideLayout.getMasterSheet();
- extractContent(slideMaster.getShapes(), true, xhtml);
+ extractContent(slideMaster.getShapes(), true, xhtml, null);
// notes (if present)
XSLFSheet slideNotes = slide.getNotes();
if (slideNotes != null) {
- extractContent(slideNotes.getShapes(), false, xhtml);
+ extractContent(slideNotes.getShapes(), false, xhtml, slideDesc);
// master sheet for this notes
XSLFSheet notesMaster = slideNotes.getMasterSheet();
- extractContent(notesMaster.getShapes(), true, xhtml);
+ extractContent(notesMaster.getShapes(), true, xhtml, null);
}
// comments (if present)
@@ -98,7 +106,7 @@ public class XSLFPowerPointExtractorDeco
}
}
- private void extractContent(XSLFShape[] shapes, boolean skipPlaceholders, XHTMLContentHandler xhtml)
+ private void extractContent(XSLFShape[] shapes, boolean skipPlaceholders, XHTMLContentHandler xhtml, String slideDesc)
throws SAXException {
for (XSLFShape sh : shapes) {
if (sh instanceof XSLFTextShape) {
@@ -111,12 +119,12 @@ public class XSLFPowerPointExtractorDeco
} else if (sh instanceof XSLFGroupShape){
// recurse into groups of shapes
XSLFGroupShape group = (XSLFGroupShape)sh;
- extractContent(group.getShapes(), skipPlaceholders, xhtml);
+ extractContent(group.getShapes(), skipPlaceholders, xhtml, slideDesc);
} else if (sh instanceof XSLFTable) {
XSLFTable tbl = (XSLFTable)sh;
for(XSLFTableRow row : tbl){
List<XSLFTableCell> cells = row.getCells();
- extractContent(cells.toArray(new XSLFTableCell[cells.size()]), skipPlaceholders, xhtml);
+ extractContent(cells.toArray(new XSLFTableCell[cells.size()]), skipPlaceholders, xhtml, slideDesc);
}
} else if (sh instanceof XSLFGraphicFrame) {
XSLFGraphicFrame frame = (XSLFGraphicFrame) sh;
@@ -127,6 +135,9 @@ public class XSLFPowerPointExtractorDeco
XmlObject relIDAtt = emb.selectAttribute(new QName("http://schemas.openxmlformats.org/officeDocument/2006/relationships", "id"));
if (relIDAtt != null) {
String relID = relIDAtt.getDomNode().getNodeValue();
+ if (slideDesc != null) {
+ relID = slideDesc + relID;
+ }
AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute("", "class", "class", "CDATA", "embedded");
attributes.addAttribute("", "id", "id", "CDATA", relID);
@@ -141,6 +152,9 @@ public class XSLFPowerPointExtractorDeco
if (ctPic.getBlipFill() != null && ctPic.getBlipFill().getBlip() != null) {
String relID = ctPic.getBlipFill().getBlip().getEmbed();
if (relID != null) {
+ if (slideDesc != null) {
+ relID = slideDesc + relID;
+ }
AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute("", "class", "class", "CDATA", "embedded");
attributes.addAttribute("", "id", "id", "CDATA", relID);
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1416029&r1=1416028&r2=1416029&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Sat Dec 1 17:57:05 2012
@@ -855,9 +855,9 @@ public class OOXMLParserTest extends Tik
input.close();
}
String xml = sw.toString();
- int h = xml.indexOf("<div class=\"embedded\" id=\"rId3\"/>");
+ int h = xml.indexOf("<div class=\"embedded\" id=\"slide1_rId3\"/>");
int i = xml.indexOf("Send me a note");
- int j = xml.indexOf("<div class=\"embedded\" id=\"rId4\"/>");
+ int j = xml.indexOf("<div class=\"embedded\" id=\"slide2_rId4\"/>");
int k = xml.indexOf("<p>No title</p>");
assertTrue(h != -1);
assertTrue(i != -1);
@@ -882,4 +882,11 @@ public class OOXMLParserTest extends Tik
assertContains("This text is inside of a text box in the header of the document.", xml);
assertContains("This text is inside of a text box in the footer of the document.", xml);
}
-}
+
+ // TIKA-1032:
+ public void testEmbeddedPPTXTwoSlides() throws Exception {
+ String xml = getXML("testPPT_embedded_two_slides.pptx").xml;
+ assertContains("<div class=\"embedded\" id=\"slide1_rId7\"/>" , xml);
+ assertContains("<div class=\"embedded\" id=\"slide2_rId7\"/>" , xml);
+ }
+ }
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPPT_embedded_two_slides.pptx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPPT_embedded_two_slides.pptx?rev=1416029&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPPT_embedded_two_slides.pptx
------------------------------------------------------------------------------
svn:executable = *
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPPT_embedded_two_slides.pptx
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream