You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2012/11/18 17:10:24 UTC

svn commit: r1410914 - in /tika/trunk: ./ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ tika-parsers/src/test/java/org/apache/tika/ tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ tika-parsers/src/test/java/org/apache/tika/p...

Author: mikemccand
Date: Sun Nov 18 16:10:23 2012
New Revision: 1410914

URL: http://svn.apache.org/viewvc?rev=1410914&view=rev
Log:
TIKA-1025: leave placeholder where embedded docs appear in .ppt extraction

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testPPT_embedded2.ppt   (with props)
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1410914&r1=1410913&r2=1410914&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sun Nov 18 16:10:23 2012
@@ -39,6 +39,11 @@ Release 1.3 - Current Development
     key, and TikaCLI prepends the rId (if present) onto the filename
     it extracts (TIKA-997).
 
+  * MS PowerPoint (.ppt): When a .ppt document contains embedded
+    files, Tika now places a <div class="embedded" id="XXX"/> (XXX is
+    the embedded object's ID) into the XHTML so you can see where in
+    the main text each embedded document occurred (TIKA-1025).
+
   * MHTML: fixed Null charset name exception when a mime part has an
     unrecognized charset (TIKA-1011).
 

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1410914&r1=1410913&r2=1410914&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java Sun Nov 18 16:10:23 2012
@@ -16,6 +16,9 @@
  */
 package org.apache.tika.parser.microsoft;
 
+import java.io.IOException;
+import java.util.HashSet;
+
 import org.apache.poi.hslf.HSLFSlideShow;
 import org.apache.poi.hslf.model.*;
 import org.apache.poi.hslf.usermodel.ObjectData;
@@ -28,9 +31,7 @@ import org.apache.tika.io.TikaInputStrea
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
-
-import java.io.IOException;
-import java.util.HashSet;
+import org.xml.sax.helpers.AttributesImpl;
 
 public class HSLFExtractor extends AbstractPOIFSExtractor {
    public HSLFExtractor(ParseContext context) {
@@ -221,27 +222,39 @@ public class HSLFExtractor extends Abstr
       for( Shape shape : shapes ) {
          if( shape instanceof OLEShape ) {
             OLEShape oleShape = (OLEShape)shape;
-            
+            ObjectData data = null;
             try {
-               ObjectData data = oleShape.getObjectData();
-
-               if(data != null) {
-                  TikaInputStream stream =
-                     TikaInputStream.get(data.getData());
-                  try {
-                     String mediaType = null;
-                     if ("Excel.Chart.8".equals(oleShape.getProgID())) {
-                        mediaType = "application/vnd.ms-excel";
-                     }
-                     handleEmbeddedResource(
-                           stream, Integer.toString(oleShape.getObjectID()), null,
-                           mediaType, xhtml, false);
-                  } finally {
-                     stream.close();
+                data = oleShape.getObjectData();
+            } catch( NullPointerException e ) { 
+                /* getObjectData sometimes throws NPE. */
+            }
+ 
+            if (data != null) {
+               String objID = Integer.toString(oleShape.getObjectID());
+
+               // Embedded Object: add a <div
+               // class="embedded" id="X"/> so consumer can see where
+               // in the main text each embedded document
+               // occurred:
+               AttributesImpl attributes = new AttributesImpl();
+               attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+               attributes.addAttribute("", "id", "id", "CDATA", objID);
+               xhtml.startElement("div", attributes);
+               xhtml.endElement("div");
+
+               TikaInputStream stream =
+                    TikaInputStream.get(data.getData());
+               try {
+                  String mediaType = null;
+                  if ("Excel.Chart.8".equals(oleShape.getProgID())) {
+                     mediaType = "application/vnd.ms-excel";
                   }
+                  handleEmbeddedResource(
+                        stream, objID, objID,
+                        mediaType, xhtml, false);
+               } finally {
+                  stream.close();
                }
-            } catch( NullPointerException e ) { 
-               /* getObjectData throws NPE some times. */
             }
          }
       }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java?rev=1410914&r1=1410913&r2=1410914&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java Sun Nov 18 16:10:23 2012
@@ -18,11 +18,21 @@ package org.apache.tika;
 
 import java.io.File;
 import java.io.InputStream;
+import java.io.StringWriter;
 import java.net.URISyntaxException;
 import java.net.URL;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
 
 import junit.framework.TestCase;
 
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+
 /**
  * Parent class of Tika tests
  */
@@ -64,4 +74,41 @@ public abstract class TikaTest extends T
     public void assertContains(String needle, String haystack) {
        assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle));
     }
+
+    protected static class XMLResult {
+        public final String xml;
+        public final Metadata metadata;
+
+        public XMLResult(String xml, Metadata metadata) {
+            this.xml = xml;
+            this.metadata = metadata;
+        }
+    }
+
+    protected XMLResult getXML(String filePath) throws Exception {
+        InputStream input = null;
+        Metadata metadata = new Metadata();
+        Parser parser = new AutoDetectParser();
+        
+        StringWriter sw = new StringWriter();
+        SAXTransformerFactory factory = (SAXTransformerFactory)
+                 SAXTransformerFactory.newInstance();
+        TransformerHandler handler = factory.newTransformerHandler();
+        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
+        handler.setResult(new StreamResult(sw));
+
+        ParseContext context = new ParseContext();
+        context.set(Parser.class, parser);
+
+        // Parse the requested file from the /test-documents/ resources
+        input = getResourceAsStream("/test-documents/" + filePath);
+        try {
+            parser.parse(input, handler, metadata, context);
+            return new XMLResult(sw.toString(), metadata);
+        } finally {
+            input.close();
+        }
+    }
+
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=1410914&r1=1410913&r2=1410914&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java Sun Nov 18 16:10:23 2012
@@ -220,4 +220,11 @@ public class PowerPointParserTest extend
        assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
        assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
     }
+
+    // TIKA-1025
+    public void testEmbeddedPlacedholder() throws Exception {
+       XMLResult result = getXML("testPPT_embedded2.ppt");
+       assertContains("<div class=\"embedded\" id=\"1\"/>", result.xml);
+       assertContains("<div class=\"embedded\" id=\"14\"/>", result.xml);
+    }
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1410914&r1=1410913&r2=1410914&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Sun Nov 18 16:10:23 2012
@@ -17,14 +17,8 @@
 package org.apache.tika.parser.microsoft;
 
 import java.io.InputStream;
-import java.io.StringWriter;
 import java.util.Locale;
 
-import javax.xml.transform.OutputKeys;
-import javax.xml.transform.sax.SAXTransformerFactory;
-import javax.xml.transform.sax.TransformerHandler;
-import javax.xml.transform.stream.StreamResult;
-
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
@@ -32,7 +26,6 @@ import org.apache.tika.metadata.OfficeOp
 import org.apache.tika.metadata.OfficeOpenXMLExtended;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.microsoft.ooxml.OOXMLParserTest;
 import org.apache.tika.sax.BodyContentHandler;
 import org.xml.sax.ContentHandler;
 
@@ -72,38 +65,6 @@ public class WordParserTest extends Tika
         }
     }
 
-    private static class XMLResult {
-        public final String xml;
-        public final Metadata metadata;
-
-        public XMLResult(String xml, Metadata metadata) {
-            this.xml = xml;
-            this.metadata = metadata;
-      }
-    }
-
-    private XMLResult getXML(String filePath) throws Exception {
-        InputStream input = null;
-        Metadata metadata = new Metadata();
-        
-        StringWriter sw = new StringWriter();
-        SAXTransformerFactory factory = (SAXTransformerFactory)
-                 SAXTransformerFactory.newInstance();
-        TransformerHandler handler = factory.newTransformerHandler();
-        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
-        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
-        handler.setResult(new StreamResult(sw));
-
-        // Try with a document containing various tables and formattings
-        input = OOXMLParserTest.class.getResourceAsStream(filePath);
-        try {
-            new OfficeParser().parse(input, handler, metadata, new ParseContext());
-            return new XMLResult(sw.toString(), metadata);
-        } finally {
-            input.close();
-        }
-    }
-
     /**
      * Test that the word converter is able to generate the
      *  correct HTML for the document
@@ -112,7 +73,7 @@ public class WordParserTest extends Tika
 
         // Try with a document containing various tables and
         // formattings
-        XMLResult result = getXML("/test-documents/testWORD.doc");
+        XMLResult result = getXML("testWORD.doc");
         String xml = result.xml;
         Metadata metadata = result.metadata;
 
@@ -142,7 +103,7 @@ public class WordParserTest extends Tika
         assertTrue(xml.contains("<p class=\"signature\">This one"));
         
         // Try with a document that contains images
-        xml = getXML("/test-documents/testWORD_3imgs.doc").xml;
+        xml = getXML("testWORD_3imgs.doc").xml;
 
         // Images 1-3
         assertTrue("Image not found in:\n"+xml, xml.contains("src=\"embedded:image1.png\""));
@@ -154,7 +115,7 @@ public class WordParserTest extends Tika
 
         // TIKA-692: test document containing multiple
         // character runs within a bold tag:
-        xml = getXML("/test-documents/testWORD_bold_character_runs.doc").xml;
+        xml = getXML("testWORD_bold_character_runs.doc").xml;
 
         // Make sure bold text arrived as single
         // contiguous string even though Word parser
@@ -163,7 +124,7 @@ public class WordParserTest extends Tika
 
         // TIKA-692: test document containing multiple
         // character runs within a bold tag:
-        xml = getXML("/test-documents/testWORD_bold_character_runs2.doc").xml;
+        xml = getXML("testWORD_bold_character_runs2.doc").xml;
             
         // Make sure bold text arrived as single
         // contiguous string even though Word parser
@@ -172,7 +133,7 @@ public class WordParserTest extends Tika
     }
 
     public void testEmbeddedNames() throws Exception {
-        String result = getXML("/test-documents/testWORD_embedded_pdf.doc").xml;
+        String result = getXML("testWORD_embedded_pdf.doc").xml;
 
         // Make sure the embedded div comes out after "Here
         // is the pdf file" and before "Bye Bye":
@@ -189,14 +150,14 @@ public class WordParserTest extends Tika
 
     // TIKA-982
     public void testEmbeddedRTF() throws Exception {
-        String result = getXML("/test-documents/testWORD_embedded_rtf.doc").xml;
+        String result = getXML("testWORD_embedded_rtf.doc").xml;
         assertTrue(result.indexOf("<div class=\"embedded\" id=\"_1404039792\"/>") != -1);
         assertTrue(result.indexOf("_1404039792.rtf") != -1);
     }
 
     // TIKA-1019
     public void testDocumentLink() throws Exception {
-        String result = getXML("/test-documents/testDocumentLink.doc").xml;
+        String result = getXML("testDocumentLink.doc").xml;
         assertTrue(result.indexOf("<div class=\"embedded\" id=\"_1327495610\"/>") != -1);
         assertTrue(result.indexOf("_1327495610.unknown") != -1);
     }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1410914&r1=1410913&r2=1410914&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Sun Nov 18 16:10:23 2012
@@ -308,36 +308,6 @@ public class OOXMLParserTest extends Tik
         }
     }
 
-    private static class XMLResult {
-        public final String xml;
-        public final Metadata metadata;
-
-        public XMLResult(String xml, Metadata metadata) {
-            this.xml = xml;
-            this.metadata = metadata;
-      }
-    }
-
-    private XMLResult getXML(String name) throws Exception {
-        StringWriter sw = new StringWriter();
-        SAXTransformerFactory factory = (SAXTransformerFactory)
-                 SAXTransformerFactory.newInstance();
-        TransformerHandler handler = factory.newTransformerHandler();
-        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
-        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
-        handler.setResult(new StreamResult(sw));
-
-        // Try with a document containing various tables and formattings
-        InputStream input = getTestDocument(name);
-        try {
-            Metadata metadata = new Metadata();
-            parser.parse(input, handler, metadata, new ParseContext());
-            return new XMLResult(sw.toString(), metadata);
-        } finally {
-            input.close();
-        }
-    }
-
     /**
      * Test that the word converter is able to generate the
      *  correct HTML for the document

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1410914&r1=1410913&r2=1410914&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Sun Nov 18 16:10:23 2012
@@ -451,37 +451,4 @@ public class PDFParserTest extends TikaT
         // Column text is now interleaved:
         assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", content);
     }
-
-    private static class XMLResult {
-        public final String xml;
-        public final Metadata metadata;
-
-        public XMLResult(String xml, Metadata metadata) {
-            this.xml = xml;
-            this.metadata = metadata;
-      }
-    }
-
-    private XMLResult getXML(String filename) throws Exception {
-        Metadata metadata = new Metadata();
-        Parser parser = new AutoDetectParser(); // Should auto-detect!        
-        StringWriter sw = new StringWriter();
-        SAXTransformerFactory factory = (SAXTransformerFactory)
-                 SAXTransformerFactory.newInstance();
-        TransformerHandler handler = factory.newTransformerHandler();
-        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
-        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
-        handler.setResult(new StreamResult(sw));
-
-        ParseContext context = new ParseContext();
-        context.set(Parser.class, parser);
-        // Try with a document containing various tables and formattings
-        InputStream input = getResourceAsStream("/test-documents/" + filename);
-        try {
-            parser.parse(input, handler, metadata, context);
-            return new XMLResult(sw.toString(), metadata);
-        } finally {
-            input.close();
-        }
-    }
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java?rev=1410914&r1=1410913&r2=1410914&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java Sun Nov 18 16:10:23 2012
@@ -343,37 +343,6 @@ public class RTFParserTest extends TikaT
         return new Result(content, metadata);
     }
 
-    private static class XMLResult {
-        public final String xml;
-        public final Metadata metadata;
-
-        public XMLResult(String xml, Metadata metadata) {
-            this.xml = xml;
-            this.metadata = metadata;
-      }
-    }
-
-    private XMLResult getXML(String filename) throws Exception {
-        Metadata metadata = new Metadata();
-        
-        StringWriter sw = new StringWriter();
-        SAXTransformerFactory factory = (SAXTransformerFactory)
-                 SAXTransformerFactory.newInstance();
-        TransformerHandler handler = factory.newTransformerHandler();
-        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
-        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
-        handler.setResult(new StreamResult(sw));
-
-        // Try with a document containing various tables and formattings
-        InputStream input = getResourceAsStream("/test-documents/" + filename);
-        try {
-            tika.getParser().parse(input, handler, metadata, new ParseContext());
-            return new XMLResult(sw.toString(), metadata);
-        } finally {
-            input.close();
-        }
-    }
-
     private String getText(String filename) throws Exception {
         return getResult(filename).text;
     }

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPPT_embedded2.ppt
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPPT_embedded2.ppt?rev=1410914&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPPT_embedded2.ppt
------------------------------------------------------------------------------
    svn:executable = *

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPPT_embedded2.ppt
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream