You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2012/12/01 18:57:06 UTC

svn commit: r1416029 - in /tika/trunk: ./ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/ tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/ tika-parsers/src/test/resources/test-documents/

Author: mikemccand
Date: Sat Dec  1 17:57:05 2012
New Revision: 1416029

URL: http://svn.apache.org/viewvc?rev=1416029&view=rev
Log:
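TIKA-1032: de-duplicate the relationship IDs (rId) of embedded files in .pptx by prefixing each with its slide name (slideN_), since the same rId can occur on more than one slide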

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testPPT_embedded_two_slides.pptx   (with props)
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1416029&r1=1416028&r2=1416029&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sat Dec  1 17:57:05 2012
@@ -37,7 +37,7 @@ Release 1.3 - Current Development
     occurred.  The id (rId) is included in the Metadata of each
     embedded document as the new Metadata.EMBEDDED_RELATIONSHIP_ID
     key, and TikaCLI prepends the rId (if present) onto the filename
-    it extracts (TIKA-997).
+    it extracts (TIKA-997, TIKA-1032).
 
   * MS PowerPoint (.ppt): When a PowerPoint (.ppt) document contains
     embedded files, Tika now places a <div class="embedded" id="XXX"/> into the

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java?rev=1416029&r1=1416028&r2=1416029&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java Sat Dec  1 17:57:05 2012
@@ -18,6 +18,7 @@ package org.apache.tika.parser.microsoft
 
 import java.io.FileNotFoundException;
 import java.io.IOException;
+import java.net.URI;
 import java.util.List;
 
 import org.apache.poi.POIXMLDocument;
@@ -108,12 +109,38 @@ public abstract class AbstractOOXMLExtra
 
         xhtml.endDocument();
     }
+  
+    protected String getJustFileName(String desc) {
+      int idx = desc.lastIndexOf('/');
+      if (idx != -1) {
+        desc = desc.substring(idx+1);
+      }
+      idx = desc.lastIndexOf('.');
+      if (idx != -1) {
+        desc = desc.substring(0, idx);
+      }
+
+      return desc;
+    }
 
     private void handleEmbeddedParts(ContentHandler handler)
             throws TikaException, IOException, SAXException {
         try {
             for (PackagePart source : getMainDocumentParts()) {
                 for (PackageRelationship rel : source.getRelationships()) {
+
+                    URI sourceURI = rel.getSourceURI();
+                    String sourceDesc;
+                    if (sourceURI != null) {
+                        sourceDesc = getJustFileName(sourceURI.getPath());
+                        if (sourceDesc.startsWith("slide")) {
+                          sourceDesc += "_";
+                        } else {
+                          sourceDesc = "";
+                        }
+                    } else {
+                        sourceDesc = "";
+                    }
                     if (rel.getTargetMode() == TargetMode.INTERNAL) {
                         PackagePart target;
 
@@ -126,12 +153,12 @@ public abstract class AbstractOOXMLExtra
                         String type = rel.getRelationshipType();
                         if (RELATION_OLE_OBJECT.equals(type)
                                 && TYPE_OLE_OBJECT.equals(target.getContentType())) {
-                            handleEmbeddedOLE(target, handler, rel.getId());
+                            handleEmbeddedOLE(target, handler, sourceDesc + rel.getId());
                         } else if (RELATION_AUDIO.equals(type)
                                 || RELATION_IMAGE.equals(type)
                                 || RELATION_PACKAGE.equals(type)
                                 || RELATION_OLE_OBJECT.equals(type)) {
-                            handleEmbeddedFile(target, handler, rel.getId());
+                            handleEmbeddedFile(target, handler, sourceDesc + rel.getId());
                         }
                     }
                 }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java?rev=1416029&r1=1416028&r2=1416029&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java Sat Dec  1 17:57:05 2012
@@ -67,25 +67,33 @@ public class XSLFPowerPointExtractorDeco
 
         XSLFSlide[] slides = slideShow.getSlides();
         for (XSLFSlide slide : slides) {
+            String slideDesc;
+            if (slide.getPackagePart() != null && slide.getPackagePart().getPartName() != null) {
+              slideDesc = getJustFileName(slide.getPackagePart().getPartName().toString());
+              slideDesc += "_";
+            } else {
+              slideDesc = null;
+            }
+
             // slide
-            extractContent(slide.getShapes(), false, xhtml);
+            extractContent(slide.getShapes(), false, xhtml, slideDesc);
 
             // slide layout which is the master sheet for this slide
             XSLFSheet slideLayout = slide.getMasterSheet();
-            extractContent(slideLayout.getShapes(), true, xhtml);
+            extractContent(slideLayout.getShapes(), true, xhtml, null);
 
             // slide master which is the master sheet for all text layouts
             XSLFSheet slideMaster = slideLayout.getMasterSheet();
-            extractContent(slideMaster.getShapes(), true, xhtml);
+            extractContent(slideMaster.getShapes(), true, xhtml, null);
 
             // notes (if present)
             XSLFSheet slideNotes = slide.getNotes();
             if (slideNotes != null) {
-                extractContent(slideNotes.getShapes(), false, xhtml);
+                extractContent(slideNotes.getShapes(), false, xhtml, slideDesc);
 
                 // master sheet for this notes
                 XSLFSheet notesMaster = slideNotes.getMasterSheet();
-                extractContent(notesMaster.getShapes(), true, xhtml);
+                extractContent(notesMaster.getShapes(), true, xhtml, null);
             }
 
             // comments (if present)
@@ -98,7 +106,7 @@ public class XSLFPowerPointExtractorDeco
         }
     }
 
-    private void extractContent(XSLFShape[] shapes, boolean skipPlaceholders, XHTMLContentHandler xhtml)
+    private void extractContent(XSLFShape[] shapes, boolean skipPlaceholders, XHTMLContentHandler xhtml, String slideDesc)
             throws SAXException {
         for (XSLFShape sh : shapes) {
             if (sh instanceof XSLFTextShape) {
@@ -111,12 +119,12 @@ public class XSLFPowerPointExtractorDeco
             } else if (sh instanceof XSLFGroupShape){
                 // recurse into groups of shapes
                 XSLFGroupShape group = (XSLFGroupShape)sh;
-                extractContent(group.getShapes(), skipPlaceholders, xhtml);
+                extractContent(group.getShapes(), skipPlaceholders, xhtml, slideDesc);
             } else if (sh instanceof XSLFTable) {
                 XSLFTable tbl = (XSLFTable)sh;
                 for(XSLFTableRow row : tbl){
                     List<XSLFTableCell> cells = row.getCells();
-                    extractContent(cells.toArray(new XSLFTableCell[cells.size()]), skipPlaceholders, xhtml);
+                    extractContent(cells.toArray(new XSLFTableCell[cells.size()]), skipPlaceholders, xhtml, slideDesc);
                 }
             } else if (sh instanceof XSLFGraphicFrame) {
                 XSLFGraphicFrame frame = (XSLFGraphicFrame) sh;
@@ -127,6 +135,9 @@ public class XSLFPowerPointExtractorDeco
                         XmlObject relIDAtt = emb.selectAttribute(new QName("http://schemas.openxmlformats.org/officeDocument/2006/relationships", "id"));
                         if (relIDAtt != null) {
                             String relID = relIDAtt.getDomNode().getNodeValue();
+                            if (slideDesc != null) {
+                              relID = slideDesc + relID;
+                            }
                             AttributesImpl attributes = new AttributesImpl();
                             attributes.addAttribute("", "class", "class", "CDATA", "embedded");
                             attributes.addAttribute("", "id", "id", "CDATA", relID);
@@ -141,6 +152,9 @@ public class XSLFPowerPointExtractorDeco
                     if (ctPic.getBlipFill() != null && ctPic.getBlipFill().getBlip() != null) {
                         String relID = ctPic.getBlipFill().getBlip().getEmbed();
                         if (relID != null) {
+                            if (slideDesc != null) {
+                              relID = slideDesc + relID;
+                            }
                             AttributesImpl attributes = new AttributesImpl();
                             attributes.addAttribute("", "class", "class", "CDATA", "embedded");
                             attributes.addAttribute("", "id", "id", "CDATA", relID);

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1416029&r1=1416028&r2=1416029&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Sat Dec  1 17:57:05 2012
@@ -855,9 +855,9 @@ public class OOXMLParserTest extends Tik
             input.close();
         }
         String xml = sw.toString();
-        int h = xml.indexOf("<div class=\"embedded\" id=\"rId3\"/>");
+        int h = xml.indexOf("<div class=\"embedded\" id=\"slide1_rId3\"/>");
         int i = xml.indexOf("Send me a note");
-        int j = xml.indexOf("<div class=\"embedded\" id=\"rId4\"/>");
+        int j = xml.indexOf("<div class=\"embedded\" id=\"slide2_rId4\"/>");
         int k = xml.indexOf("<p>No title</p>");
         assertTrue(h != -1);
         assertTrue(i != -1);
@@ -882,4 +882,11 @@ public class OOXMLParserTest extends Tik
         assertContains("This text is inside of a text box in the header of the document.", xml);
         assertContains("This text is inside of a text box in the footer of the document.", xml);
     }
-}
+
+    // TIKA-1032:
+    public void testEmbeddedPPTXTwoSlides() throws Exception {
+        String xml = getXML("testPPT_embedded_two_slides.pptx").xml;
+        assertContains("<div class=\"embedded\" id=\"slide1_rId7\"/>" , xml);
+        assertContains("<div class=\"embedded\" id=\"slide2_rId7\"/>" , xml);
+    }
+  }

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPPT_embedded_two_slides.pptx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPPT_embedded_two_slides.pptx?rev=1416029&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPPT_embedded_two_slides.pptx
------------------------------------------------------------------------------
    svn:executable = *

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPPT_embedded_two_slides.pptx
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream