You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2015/09/30 18:51:20 UTC

svn commit: r1706079 - in /tika/trunk: ./ tika-parsers/ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/ tika-parsers/src/test/java/org/apache/tika/parser/microsoft/

Author: tallison
Date: Wed Sep 30 16:51:19 2015
New Revision: 1706079

URL: http://svn.apache.org/viewvc?rev=1706079&view=rev
Log:
TIKA-1707: upgrade to POI 3.13

Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/pom.xml
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1706079&r1=1706078&r2=1706079&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Wed Sep 30 16:51:19 2015
@@ -1,4 +1,5 @@
 Release 1.11 - Current Development
+  * Upgrade to POI 3.13-final (via Andreas Beeker) (TIKA-1707).
 
   * Upgraded tika-batch to use Path throughout (TIKA-1747 and
     TIKA-1754).

Modified: tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1706079&r1=1706078&r2=1706079&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Wed Sep 30 16:51:19 2015
@@ -35,7 +35,7 @@
   <url>http://tika.apache.org/</url>
 
   <properties>
-    <poi.version>3.13-beta1</poi.version>
+    <poi.version>3.13</poi.version>
     <!-- NOTE: sync codec version with POI -->
     <codec.version>1.9</codec.version>
     <!-- NOTE: sync tukaani version with commons-compress -->

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1706079&r1=1706078&r2=1706079&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java Wed Sep 30 16:51:19 2015
@@ -18,23 +18,23 @@ package org.apache.tika.parser.microsoft
 
 import java.io.IOException;
 import java.util.HashSet;
+import java.util.List;
 
-import org.apache.poi.hslf.HSLFSlideShow;
 import org.apache.poi.hslf.model.Comment;
 import org.apache.poi.hslf.model.HeadersFooters;
-import org.apache.poi.hslf.model.MasterSheet;
-import org.apache.poi.hslf.model.Notes;
 import org.apache.poi.hslf.model.OLEShape;
-import org.apache.poi.hslf.model.Picture;
-import org.apache.poi.hslf.model.Shape;
-import org.apache.poi.hslf.model.Slide;
-import org.apache.poi.hslf.model.Table;
-import org.apache.poi.hslf.model.TableCell;
-import org.apache.poi.hslf.model.TextRun;
-import org.apache.poi.hslf.model.TextShape;
-import org.apache.poi.hslf.usermodel.ObjectData;
-import org.apache.poi.hslf.usermodel.PictureData;
-import org.apache.poi.hslf.usermodel.SlideShow;
+import org.apache.poi.hslf.usermodel.HSLFMasterSheet;
+import org.apache.poi.hslf.usermodel.HSLFNotes;
+import org.apache.poi.hslf.usermodel.HSLFObjectData;
+import org.apache.poi.hslf.usermodel.HSLFPictureData;
+import org.apache.poi.hslf.usermodel.HSLFShape;
+import org.apache.poi.hslf.usermodel.HSLFSlide;
+import org.apache.poi.hslf.usermodel.HSLFSlideShow;
+import org.apache.poi.hslf.usermodel.HSLFTable;
+import org.apache.poi.hslf.usermodel.HSLFTableCell;
+import org.apache.poi.hslf.usermodel.HSLFTextParagraph;
+import org.apache.poi.hslf.usermodel.HSLFTextRun;
+import org.apache.poi.hslf.usermodel.HSLFTextShape;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.tika.exception.TikaException;
@@ -59,13 +59,12 @@ public class HSLFExtractor extends Abstr
             DirectoryNode root, XHTMLContentHandler xhtml)
             throws IOException, SAXException, TikaException {
         HSLFSlideShow ss = new HSLFSlideShow(root);
-        SlideShow _show = new SlideShow(ss);
-        Slide[] _slides = _show.getSlides();
+        List<HSLFSlide> _slides = ss.getSlides();
 
         xhtml.startElement("div", "class", "slideShow");
 
       /* Iterate over slides and extract text */
-        for (Slide slide : _slides) {
+        for (HSLFSlide slide : _slides) {
             xhtml.startElement("div", "class", "slide");
 
             // Slide header, if present
@@ -83,17 +82,17 @@ public class HSLFExtractor extends Abstr
 
             // Slide text
             {
-                xhtml.startElement("p", "class", "slide-content");
+                xhtml.startElement("div", "class", "slide-content");
 
-                textRunsToText(xhtml, slide.getTextRuns());
+                textRunsToText(xhtml, slide.getTextParagraphs());
 
-                xhtml.endElement("p");
+                xhtml.endElement("div");
             }
 
             // Table text
-            for (Shape shape : slide.getShapes()) {
-                if (shape instanceof Table) {
-                    extractTableText(xhtml, (Table) shape);
+            for (HSLFShape shape : slide.getShapes()) {
+                if (shape instanceof HSLFTable) {
+                    extractTableText(xhtml, (HSLFTable) shape);
                 }
             }
 
@@ -138,11 +137,11 @@ public class HSLFExtractor extends Abstr
 
       /* notes */
         xhtml.startElement("div", "class", "slideNotes");
-        HashSet<Integer> seenNotes = new HashSet<Integer>();
-        HeadersFooters hf = _show.getNotesHeadersFooters();
+        HashSet<Integer> seenNotes = new HashSet<>();
+        HeadersFooters hf = ss.getNotesHeadersFooters();
 
-        for (Slide slide : _slides) {
-            Notes notes = slide.getNotesSheet();
+        for (HSLFSlide slide : _slides) {
+            HSLFNotes notes = slide.getNotes();
             if (notes == null) {
                 continue;
             }
@@ -160,7 +159,7 @@ public class HSLFExtractor extends Abstr
             }
 
             // Notes text
-            textRunsToText(xhtml, notes.getTextRuns());
+            textRunsToText(xhtml, notes.getTextParagraphs());
 
             // Repeat the notes footer, if set
             if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) {
@@ -170,25 +169,25 @@ public class HSLFExtractor extends Abstr
             }
         }
 
-        handleSlideEmbeddedPictures(_show, xhtml);
+        handleSlideEmbeddedPictures(ss, xhtml);
 
         xhtml.endElement("div");
     }
 
-    private void extractMaster(XHTMLContentHandler xhtml, MasterSheet master) throws SAXException {
+    private void extractMaster(XHTMLContentHandler xhtml, HSLFMasterSheet master) throws SAXException {
         if (master == null) {
             return;
         }
-        Shape[] shapes = master.getShapes();
-        if (shapes == null || shapes.length == 0) {
+        List<HSLFShape> shapes = master.getShapes();
+        if (shapes == null || shapes.isEmpty()) {
             return;
         }
 
         xhtml.startElement("div", "class", "slide-master-content");
-        for (Shape shape : shapes) {
-            if (shape != null && !MasterSheet.isPlaceholder(shape)) {
-                if (shape instanceof TextShape) {
-                    TextShape tsh = (TextShape) shape;
+        for (HSLFShape shape : shapes) {
+            if (shape != null && !HSLFMasterSheet.isPlaceholder(shape)) {
+                if (shape instanceof HSLFTextShape) {
+                	HSLFTextShape tsh = (HSLFTextShape) shape;
                     String text = tsh.getText();
                     if (text != null) {
                         xhtml.element("p", text);
@@ -199,12 +198,12 @@ public class HSLFExtractor extends Abstr
         xhtml.endElement("div");
     }
 
-    private void extractTableText(XHTMLContentHandler xhtml, Table shape) throws SAXException {
+    private void extractTableText(XHTMLContentHandler xhtml, HSLFTable shape) throws SAXException {
         xhtml.startElement("table");
         for (int row = 0; row < shape.getNumberOfRows(); row++) {
             xhtml.startElement("tr");
             for (int col = 0; col < shape.getNumberOfColumns(); col++) {
-                TableCell cell = shape.getCell(row, col);
+                HSLFTableCell cell = shape.getCell(row, col);
                 //insert empty string for empty cell if cell is null
                 String txt = "";
                 if (cell != null) {
@@ -217,48 +216,59 @@ public class HSLFExtractor extends Abstr
         xhtml.endElement("table");
     }
 
-    private void textRunsToText(XHTMLContentHandler xhtml, TextRun[] runs) throws SAXException {
-        if (runs == null) {
+    private void textRunsToText(XHTMLContentHandler xhtml, List<List<HSLFTextParagraph>> paragraphsList) throws SAXException {
+        if (paragraphsList == null) {
             return;
         }
 
-        for (TextRun run : runs) {
-            if (run != null) {
-                // Leaving in wisdom from TIKA-712 for easy revert.
-                // Avoid boiler-plate text on the master slide (0
-                // = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE):
-                //if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) {
-                String txt = run.getText();
-                if (txt != null) {
-                    xhtml.characters(txt);
-                    xhtml.startElement("br");
-                    xhtml.endElement("br");
-                }
+        for (List<HSLFTextParagraph> run : paragraphsList) {
+            // Leaving in wisdom from TIKA-712 for easy revert.
+            // Avoid boiler-plate text on the master slide (0
+            // = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE):
+            //if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) {
+
+        	for (HSLFTextParagraph htp : run) {
+        		xhtml.startElement("p");
+
+        		for (HSLFTextRun htr : htp.getTextRuns()) {
+        			String line = htr.getRawText();
+        			if (line != null) {
+        				boolean isfirst = true;
+        				for (String fragment : line.split("\\u000b")){
+        					if (!isfirst)  {
+        	                    xhtml.startElement("br");
+        	                    xhtml.endElement("br");
+        					}
+        					isfirst = false;
+        					xhtml.characters(fragment.trim());
+        				}
+        			}
+        		}
+                xhtml.endElement("p");
+
             }
+        	
         }
     }
 
-    private void handleSlideEmbeddedPictures(SlideShow slideshow, XHTMLContentHandler xhtml)
+    private void handleSlideEmbeddedPictures(HSLFSlideShow slideshow, XHTMLContentHandler xhtml)
             throws TikaException, SAXException, IOException {
-        for (PictureData pic : slideshow.getPictureData()) {
-            String mediaType = null;
+        for (HSLFPictureData pic : slideshow.getPictureData()) {
+            String mediaType;
 
             switch (pic.getType()) {
-                case Picture.EMF:
+                case EMF:
                     mediaType = "application/x-emf";
                     break;
-                case Picture.JPEG:
-                    mediaType = "image/jpeg";
-                    break;
-                case Picture.PNG:
-                    mediaType = "image/png";
-                    break;
-                case Picture.WMF:
+                case WMF:
                     mediaType = "application/x-msmetafile";
                     break;
-                case Picture.DIB:
+                case DIB:
                     mediaType = "image/bmp";
                     break;
+                default:
+            		mediaType = pic.getContentType();
+            		break;
             }
 
             handleEmbeddedResource(
@@ -267,9 +277,9 @@ public class HSLFExtractor extends Abstr
         }
     }
 
-    private void handleSlideEmbeddedResources(Slide slide, XHTMLContentHandler xhtml)
+    private void handleSlideEmbeddedResources(HSLFSlide slide, XHTMLContentHandler xhtml)
             throws TikaException, SAXException, IOException {
-        Shape[] shapes;
+        List<HSLFShape> shapes;
         try {
             shapes = slide.getShapes();
         } catch (NullPointerException e) {
@@ -278,10 +288,10 @@ public class HSLFExtractor extends Abstr
             return;
         }
 
-        for (Shape shape : shapes) {
+        for (HSLFShape shape : shapes) {
             if (shape instanceof OLEShape) {
                 OLEShape oleShape = (OLEShape) shape;
-                ObjectData data = null;
+                HSLFObjectData data = null;
                 try {
                     data = oleShape.getObjectData();
                 } catch (NullPointerException e) {

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java?rev=1706079&r1=1706078&r2=1706079&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java Wed Sep 30 16:51:19 2015
@@ -34,11 +34,14 @@ import org.apache.poi.xslf.usermodel.XML
 import org.apache.poi.xslf.usermodel.XSLFComments;
 import org.apache.poi.xslf.usermodel.XSLFGraphicFrame;
 import org.apache.poi.xslf.usermodel.XSLFGroupShape;
+import org.apache.poi.xslf.usermodel.XSLFNotes;
+import org.apache.poi.xslf.usermodel.XSLFNotesMaster;
 import org.apache.poi.xslf.usermodel.XSLFPictureShape;
 import org.apache.poi.xslf.usermodel.XSLFRelation;
 import org.apache.poi.xslf.usermodel.XSLFShape;
 import org.apache.poi.xslf.usermodel.XSLFSheet;
 import org.apache.poi.xslf.usermodel.XSLFSlide;
+import org.apache.poi.xslf.usermodel.XSLFSlideLayout;
 import org.apache.poi.xslf.usermodel.XSLFTable;
 import org.apache.poi.xslf.usermodel.XSLFTableCell;
 import org.apache.poi.xslf.usermodel.XSLFTableRow;
@@ -66,7 +69,7 @@ public class XSLFPowerPointExtractorDeco
     protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException {
         XMLSlideShow slideShow = (XMLSlideShow) extractor.getDocument();
 
-        XSLFSlide[] slides = slideShow.getSlides();
+        List<XSLFSlide> slides = slideShow.getSlides();
         for (XSLFSlide slide : slides) {
             String slideDesc;
             if (slide.getPackagePart() != null && slide.getPackagePart().getPartName() != null) {
@@ -80,7 +83,7 @@ public class XSLFPowerPointExtractorDeco
             extractContent(slide.getShapes(), false, xhtml, slideDesc);
 
             // slide layout which is the master sheet for this slide
-            XSLFSheet slideLayout = slide.getMasterSheet();
+            XSLFSlideLayout slideLayout = slide.getMasterSheet();
             extractContent(slideLayout.getShapes(), true, xhtml, null);
 
             // slide master which is the master sheet for all text layouts
@@ -88,12 +91,12 @@ public class XSLFPowerPointExtractorDeco
             extractContent(slideMaster.getShapes(), true, xhtml, null);
 
             // notes (if present)
-            XSLFSheet slideNotes = slide.getNotes();
+            XSLFNotes slideNotes = slide.getNotes();
             if (slideNotes != null) {
                 extractContent(slideNotes.getShapes(), false, xhtml, slideDesc);
 
                 // master sheet for these notes
-                XSLFSheet notesMaster = slideNotes.getMasterSheet();
+                XSLFNotesMaster notesMaster = slideNotes.getMasterSheet();
                 extractContent(notesMaster.getShapes(), true, xhtml, null);
             }
 
@@ -108,7 +111,7 @@ public class XSLFPowerPointExtractorDeco
         }
     }
 
-    private void extractContent(XSLFShape[] shapes, boolean skipPlaceholders, XHTMLContentHandler xhtml, String slideDesc)
+    private void extractContent(List<? extends XSLFShape> shapes, boolean skipPlaceholders, XHTMLContentHandler xhtml, String slideDesc)
             throws SAXException {
         for (XSLFShape sh : shapes) {
             if (sh instanceof XSLFTextShape) {
@@ -126,7 +129,7 @@ public class XSLFPowerPointExtractorDeco
                 XSLFTable tbl = (XSLFTable) sh;
                 for (XSLFTableRow row : tbl) {
                     List<XSLFTableCell> cells = row.getCells();
-                    extractContent(cells.toArray(new XSLFTableCell[cells.size()]), skipPlaceholders, xhtml, slideDesc);
+                    extractContent(cells, skipPlaceholders, xhtml, slideDesc);
                 }
             } else if (sh instanceof XSLFGraphicFrame) {
                 XSLFGraphicFrame frame = (XSLFGraphicFrame) sh;
@@ -175,7 +178,7 @@ public class XSLFPowerPointExtractorDeco
      */
     @Override
     protected List<PackagePart> getMainDocumentParts() throws TikaException {
-        List<PackagePart> parts = new ArrayList<PackagePart>();
+        List<PackagePart> parts = new ArrayList<>();
         XMLSlideShow slideShow = (XMLSlideShow) extractor.getDocument();
         XSLFSlideShow document = null;
         try {

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=1706079&r1=1706078&r2=1706079&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java Wed Sep 30 16:51:19 2015
@@ -234,8 +234,9 @@ public class PowerPointParserTest extend
         //autodate automatically.  For pptx, where value is stored,
         //value is extracted.  For ppt, however, no date is extracted.
         XMLResult result = getXML("testPPT_autodate.ppt");
+        System.out.println(result.xml);
         assertContains(
-                "<p class=\"slide-content\">Now<br />\n*<br />\n*<br />",
+                "<div class=\"slide-content\"><p>Now</p>",
                 result.xml);
     }
 }