You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2013/09/27 18:18:55 UTC

svn commit: r1526975 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java

Author: tallison
Date: Fri Sep 27 16:18:54 2013
New Revision: 1526975

URL: http://svn.apache.org/r1526975
Log:
TIKA-1076 extract text from tables in ppt.

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1526975&r1=1526974&r2=1526975&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java Fri Sep 27 16:18:54 2013
@@ -85,6 +85,13 @@ public class HSLFExtractor extends Abstr
             xhtml.endElement("p");
          }
 
+         // Table text
+         for (Shape shape: slide.getShapes()){
+            if (shape instanceof Table){
+               extractTableText(xhtml, (Table)shape);
+            }
+         }
+
          // Slide footer, if present
          if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) {
             xhtml.startElement("p", "class", "slide-footer");
@@ -163,6 +170,24 @@ public class HSLFExtractor extends Abstr
       xhtml.endElement("div");
    }
 
+   private void extractTableText(XHTMLContentHandler xhtml, Table shape) throws SAXException {
+      xhtml.startElement("table");
+      for (int row = 0; row < shape.getNumberOfRows(); row++){
+         xhtml.startElement("tr");
+         for (int col = 0; col < shape.getNumberOfColumns(); col++){
+            TableCell cell = shape.getCell(row, col);
+            //insert empty string for empty cell if cell is null
+            String txt = "";
+            if (cell != null){
+               txt = cell.getText();
+            }
+            xhtml.element("td", txt);
+         }
+         xhtml.endElement("tr");
+      }
+      xhtml.endElement("table");   
+   }
+
    private void textRunsToText(XHTMLContentHandler xhtml, TextRun[] runs, boolean isMaster) throws SAXException {
       if (runs==null) {
          return;

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=1526975&r1=1526974&r2=1526975&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java Fri Sep 27 16:18:54 2013
@@ -79,8 +79,7 @@ public class PowerPointParserTest extend
         assertContains("Here is a citation:", content);
         assertContains("Figure 1 This is a caption for Figure 1", content);
         assertContains("(Kramer)", content);
-        // TODO Work out why the upgrade to POI 3.9 broke this test (table text)
-//        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
+        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
         assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," "));
         assertContains("This is a hyperlink", content);
         assertContains("Here is a list:", content);