You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2013/09/27 18:18:55 UTC
svn commit: r1526975 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
Author: tallison
Date: Fri Sep 27 16:18:54 2013
New Revision: 1526975
URL: http://svn.apache.org/r1526975
Log:
TIKA-1076 extract text from tables in ppt.
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1526975&r1=1526974&r2=1526975&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java Fri Sep 27 16:18:54 2013
@@ -85,6 +85,13 @@ public class HSLFExtractor extends Abstr
xhtml.endElement("p");
}
+ // Table text
+ for (Shape shape: slide.getShapes()){
+ if (shape instanceof Table){
+ extractTableText(xhtml, (Table)shape);
+ }
+ }
+
// Slide footer, if present
if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) {
xhtml.startElement("p", "class", "slide-footer");
@@ -163,6 +170,24 @@ public class HSLFExtractor extends Abstr
xhtml.endElement("div");
}
+ private void extractTableText(XHTMLContentHandler xhtml, Table shape) throws SAXException { // Writes the text of a table shape to xhtml as a <table>, walking every (row, col) position.
+ xhtml.startElement("table");
+ for (int row = 0; row < shape.getNumberOfRows(); row++){ // one <tr> per table row, in index order
+ xhtml.startElement("tr");
+ for (int col = 0; col < shape.getNumberOfColumns(); col++){
+ TableCell cell = shape.getCell(row, col);
+ // A null cell is passed on as an empty string rather than skipped. NOTE(review): confirm that element("td", "") still emits a <td>; if empty values are dropped, later cells in the row shift left.
+ String txt = "";
+ if (cell != null){
+ txt = cell.getText(); // NOTE(review): presumably non-null for a non-null cell; verify against the POI TableCell API
+ }
+ xhtml.element("td", txt); // called once per (row, col) position, with "" for a null cell
+ }
+ xhtml.endElement("tr");
+ }
+ xhtml.endElement("table");
+ }
+
private void textRunsToText(XHTMLContentHandler xhtml, TextRun[] runs, boolean isMaster) throws SAXException {
if (runs==null) {
return;
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=1526975&r1=1526974&r2=1526975&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java Fri Sep 27 16:18:54 2013
@@ -79,8 +79,7 @@ public class PowerPointParserTest extend
assertContains("Here is a citation:", content);
assertContains("Figure 1 This is a caption for Figure 1", content);
assertContains("(Kramer)", content);
- // TODO Work out why the upgrade to POI 3.9 broke this test (table text)
-// assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
+ assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," "));
assertContains("This is a hyperlink", content);
assertContains("Here is a list:", content);