You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain text copy.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/09/21 19:03:38 UTC
svn commit: r1173761 - in
/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft:
HSLFExtractor.java ooxml/XSLFPowerPointExtractorDecorator.java
Author: nick
Date: Wed Sep 21 17:03:38 2011
New Revision: 1173761
URL: http://svn.apache.org/viewvc?rev=1173761&view=rev
Log:
TIKA-712 Fetch Master Slide text for PPT and PPTX text extraction
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1173761&r1=1173760&r2=1173761&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java Wed Sep 21 17:03:38 2011
@@ -39,7 +39,12 @@ public class HSLFExtractor extends Abstr
throws IOException, SAXException, TikaException {
PowerPointExtractor powerPointExtractor =
new PowerPointExtractor(filesystem);
- xhtml.element("p", powerPointExtractor.getText(true, true));
+ powerPointExtractor.setSlidesByDefault(true);
+ powerPointExtractor.setNotesByDefault(true);
+ powerPointExtractor.setCommentsByDefault(true);
+ powerPointExtractor.setMasterByDefault(true);
+
+ xhtml.element("p", powerPointExtractor.getText());
List<OLEShape> shapeList = powerPointExtractor.getOLEShapes();
for (OLEShape shape : shapeList) {
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java?rev=1173761&r1=1173760&r2=1173761&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java Wed Sep 21 17:03:38 2011
@@ -33,6 +33,7 @@ import org.apache.poi.xslf.usermodel.XML
import org.apache.poi.xslf.usermodel.XSLFCommonSlideData;
import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.apache.poi.xslf.usermodel.XSLFSlide;
+import org.apache.poi.xslf.usermodel.XSLFSlideMaster;
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
@@ -77,20 +78,30 @@ public class XSLFPowerPointExtractorDeco
continue;
}
+ XSLFSlideMaster master = slide.getMasterSheet();
CTNotesSlide notes = rawSlideShow.getNotes(slideId);
CTCommentList comments = rawSlideShow.getSlideComments(slideId);
+ // TODO In POI 3.8 beta 5, improve how we get this
xhtml.startElement("div");
XSLFCommonSlideData common = new XSLFCommonSlideData(slide.getXmlObject().getCSld());
extractShapeContent(common, xhtml);
+ // If there are comments, extract them
if (comments != null) {
for (CTComment comment : comments.getCmArray()) {
xhtml.element("p", comment.getText());
}
}
+
+ // Get text from the master slide
+ if(master != null) {
+ // TODO In POI 3.8 beta 5, improve how we get this
+ extractShapeContent(new XSLFCommonSlideData(master.getXmlObject().getCSld()), xhtml);
+ }
if (notes != null) {
+ // TODO In POI 3.8 beta 5, improve how we get this
extractShapeContent(new XSLFCommonSlideData(notes.getCSld()), xhtml);
}
xhtml.endElement("div");