You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/09/29 16:12:21 UTC

svn commit: r1177313 - /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java

Author: nick
Date: Thu Sep 29 14:12:21 2011
New Revision: 1177313

URL: http://svn.apache.org/viewvc?rev=1177313&view=rev
Log:
HSLF Extractor improvements contributed by Pablo in TIKA-727

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1177313&r1=1177312&r2=1177313&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java Thu Sep 29 14:12:21 2011
@@ -22,7 +22,6 @@ import java.util.HashSet;
 import org.apache.poi.hslf.HSLFSlideShow;
 import org.apache.poi.hslf.model.Comment;
 import org.apache.poi.hslf.model.HeadersFooters;
-import org.apache.poi.hslf.model.MasterSheet;
 import org.apache.poi.hslf.model.Notes;
 import org.apache.poi.hslf.model.OLEShape;
 import org.apache.poi.hslf.model.Shape;
@@ -49,7 +48,7 @@ public class HSLFExtractor extends Abstr
       SlideShow _show = new SlideShow(ss);
       Slide[] _slides = _show.getSlides();
 
-      xhtml.startElement("div", "style", "slideShow");
+      xhtml.startElement("div", "class", "slideShow");
 
       /* Iterate over slides and extract text */
       for( Slide slide : _slides ) {
@@ -97,11 +96,18 @@ public class HSLFExtractor extends Abstr
          // Comments, if present
          for( Comment comment : slide.getComments() ) {
             xhtml.startElement("p", "class", "slide-comment");
-            xhtml.startElement("b");
-            xhtml.characters( comment.getAuthor() );
-            xhtml.endElement("b");
-            xhtml.characters( "&nbsp-&nbsp");
-            xhtml.characters( comment.getText() );
+            if (comment.getAuthor() != null) {
+               xhtml.startElement("b");
+               xhtml.characters( comment.getAuthor() );
+               xhtml.endElement("b");
+               
+               if (comment.getText() != null) {
+                  xhtml.characters( " - ");
+               }
+            }
+            if (comment.getText() != null) {
+               xhtml.characters( comment.getText() );
+            }
             xhtml.endElement("p");
          }
 
@@ -136,7 +142,7 @@ public class HSLFExtractor extends Abstr
          // Repeat the Notes header, if set
          if (hf != null && hf.isHeaderVisible() && hf.getHeaderText() != null) {
             xhtml.startElement("p", "class", "slide-note-header");
-            xhtml.characters( hf.getFooterText() );
+            xhtml.characters( hf.getHeaderText() );
             xhtml.endElement("p");
          }
 
@@ -170,7 +176,16 @@ public class HSLFExtractor extends Abstr
 
    private void handleSlideEmbeddedResources(Slide slide, XHTMLContentHandler xhtml) 
                 throws TikaException, SAXException, IOException {
-      for( Shape shape : slide.getShapes() ) {
+      Shape[] shapes;
+      try {
+         shapes = slide.getShapes();
+      } catch(NullPointerException e) {
+         // Sometimes HSLF hits problems
+         // Please open POI bugs for any you come across!
+         return;
+      }
+      
+      for( Shape shape : shapes ) {
          if( shape instanceof OLEShape ) {
             OLEShape oleShape = (OLEShape)shape;