You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2015/09/30 18:51:20 UTC
svn commit: r1706079 - in /tika/trunk: ./ tika-parsers/
tika-parsers/src/main/java/org/apache/tika/parser/microsoft/
tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/
tika-parsers/src/test/java/org/apache/tika/parser/microsoft/
Author: tallison
Date: Wed Sep 30 16:51:19 2015
New Revision: 1706079
URL: http://svn.apache.org/viewvc?rev=1706079&view=rev
Log:
TIKA-1707: upgrade to POI 3.13
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/pom.xml
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1706079&r1=1706078&r2=1706079&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Wed Sep 30 16:51:19 2015
@@ -1,4 +1,5 @@
Release 1.11 - Current Development
+ * Upgrade to POI 3.13-final (via Andreas Beeker) (TIKA-1707).
* Upgraded tika-batch to use Path throughout (TIKA-1747 and
TIKA-1754).
Modified: tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1706079&r1=1706078&r2=1706079&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Wed Sep 30 16:51:19 2015
@@ -35,7 +35,7 @@
<url>http://tika.apache.org/</url>
<properties>
- <poi.version>3.13-beta1</poi.version>
+ <poi.version>3.13</poi.version>
<!-- NOTE: sync codec version with POI -->
<codec.version>1.9</codec.version>
<!-- NOTE: sync tukaani version with commons-compress -->
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1706079&r1=1706078&r2=1706079&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java Wed Sep 30 16:51:19 2015
@@ -18,23 +18,23 @@ package org.apache.tika.parser.microsoft
import java.io.IOException;
import java.util.HashSet;
+import java.util.List;
-import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.Comment;
import org.apache.poi.hslf.model.HeadersFooters;
-import org.apache.poi.hslf.model.MasterSheet;
-import org.apache.poi.hslf.model.Notes;
import org.apache.poi.hslf.model.OLEShape;
-import org.apache.poi.hslf.model.Picture;
-import org.apache.poi.hslf.model.Shape;
-import org.apache.poi.hslf.model.Slide;
-import org.apache.poi.hslf.model.Table;
-import org.apache.poi.hslf.model.TableCell;
-import org.apache.poi.hslf.model.TextRun;
-import org.apache.poi.hslf.model.TextShape;
-import org.apache.poi.hslf.usermodel.ObjectData;
-import org.apache.poi.hslf.usermodel.PictureData;
-import org.apache.poi.hslf.usermodel.SlideShow;
+import org.apache.poi.hslf.usermodel.HSLFMasterSheet;
+import org.apache.poi.hslf.usermodel.HSLFNotes;
+import org.apache.poi.hslf.usermodel.HSLFObjectData;
+import org.apache.poi.hslf.usermodel.HSLFPictureData;
+import org.apache.poi.hslf.usermodel.HSLFShape;
+import org.apache.poi.hslf.usermodel.HSLFSlide;
+import org.apache.poi.hslf.usermodel.HSLFSlideShow;
+import org.apache.poi.hslf.usermodel.HSLFTable;
+import org.apache.poi.hslf.usermodel.HSLFTableCell;
+import org.apache.poi.hslf.usermodel.HSLFTextParagraph;
+import org.apache.poi.hslf.usermodel.HSLFTextRun;
+import org.apache.poi.hslf.usermodel.HSLFTextShape;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.tika.exception.TikaException;
@@ -59,13 +59,12 @@ public class HSLFExtractor extends Abstr
DirectoryNode root, XHTMLContentHandler xhtml)
throws IOException, SAXException, TikaException {
HSLFSlideShow ss = new HSLFSlideShow(root);
- SlideShow _show = new SlideShow(ss);
- Slide[] _slides = _show.getSlides();
+ List<HSLFSlide> _slides = ss.getSlides();
xhtml.startElement("div", "class", "slideShow");
/* Iterate over slides and extract text */
- for (Slide slide : _slides) {
+ for (HSLFSlide slide : _slides) {
xhtml.startElement("div", "class", "slide");
// Slide header, if present
@@ -83,17 +82,17 @@ public class HSLFExtractor extends Abstr
// Slide text
{
- xhtml.startElement("p", "class", "slide-content");
+ xhtml.startElement("div", "class", "slide-content");
- textRunsToText(xhtml, slide.getTextRuns());
+ textRunsToText(xhtml, slide.getTextParagraphs());
- xhtml.endElement("p");
+ xhtml.endElement("div");
}
// Table text
- for (Shape shape : slide.getShapes()) {
- if (shape instanceof Table) {
- extractTableText(xhtml, (Table) shape);
+ for (HSLFShape shape : slide.getShapes()) {
+ if (shape instanceof HSLFTable) {
+ extractTableText(xhtml, (HSLFTable) shape);
}
}
@@ -138,11 +137,11 @@ public class HSLFExtractor extends Abstr
/* notes */
xhtml.startElement("div", "class", "slideNotes");
- HashSet<Integer> seenNotes = new HashSet<Integer>();
- HeadersFooters hf = _show.getNotesHeadersFooters();
+ HashSet<Integer> seenNotes = new HashSet<>();
+ HeadersFooters hf = ss.getNotesHeadersFooters();
- for (Slide slide : _slides) {
- Notes notes = slide.getNotesSheet();
+ for (HSLFSlide slide : _slides) {
+ HSLFNotes notes = slide.getNotes();
if (notes == null) {
continue;
}
@@ -160,7 +159,7 @@ public class HSLFExtractor extends Abstr
}
// Notes text
- textRunsToText(xhtml, notes.getTextRuns());
+ textRunsToText(xhtml, notes.getTextParagraphs());
// Repeat the notes footer, if set
if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) {
@@ -170,25 +169,25 @@ public class HSLFExtractor extends Abstr
}
}
- handleSlideEmbeddedPictures(_show, xhtml);
+ handleSlideEmbeddedPictures(ss, xhtml);
xhtml.endElement("div");
}
- private void extractMaster(XHTMLContentHandler xhtml, MasterSheet master) throws SAXException {
+ private void extractMaster(XHTMLContentHandler xhtml, HSLFMasterSheet master) throws SAXException {
if (master == null) {
return;
}
- Shape[] shapes = master.getShapes();
- if (shapes == null || shapes.length == 0) {
+ List<HSLFShape> shapes = master.getShapes();
+ if (shapes == null || shapes.isEmpty()) {
return;
}
xhtml.startElement("div", "class", "slide-master-content");
- for (Shape shape : shapes) {
- if (shape != null && !MasterSheet.isPlaceholder(shape)) {
- if (shape instanceof TextShape) {
- TextShape tsh = (TextShape) shape;
+ for (HSLFShape shape : shapes) {
+ if (shape != null && !HSLFMasterSheet.isPlaceholder(shape)) {
+ if (shape instanceof HSLFTextShape) {
+ HSLFTextShape tsh = (HSLFTextShape) shape;
String text = tsh.getText();
if (text != null) {
xhtml.element("p", text);
@@ -199,12 +198,12 @@ public class HSLFExtractor extends Abstr
xhtml.endElement("div");
}
- private void extractTableText(XHTMLContentHandler xhtml, Table shape) throws SAXException {
+ private void extractTableText(XHTMLContentHandler xhtml, HSLFTable shape) throws SAXException {
xhtml.startElement("table");
for (int row = 0; row < shape.getNumberOfRows(); row++) {
xhtml.startElement("tr");
for (int col = 0; col < shape.getNumberOfColumns(); col++) {
- TableCell cell = shape.getCell(row, col);
+ HSLFTableCell cell = shape.getCell(row, col);
//insert empty string for empty cell if cell is null
String txt = "";
if (cell != null) {
@@ -217,48 +216,59 @@ public class HSLFExtractor extends Abstr
xhtml.endElement("table");
}
- private void textRunsToText(XHTMLContentHandler xhtml, TextRun[] runs) throws SAXException {
- if (runs == null) {
+ private void textRunsToText(XHTMLContentHandler xhtml, List<List<HSLFTextParagraph>> paragraphsList) throws SAXException {
+ if (paragraphsList == null) {
return;
}
- for (TextRun run : runs) {
- if (run != null) {
- // Leaving in wisdom from TIKA-712 for easy revert.
- // Avoid boiler-plate text on the master slide (0
- // = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE):
- //if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) {
- String txt = run.getText();
- if (txt != null) {
- xhtml.characters(txt);
- xhtml.startElement("br");
- xhtml.endElement("br");
- }
+ for (List<HSLFTextParagraph> run : paragraphsList) {
+ // Leaving in wisdom from TIKA-712 for easy revert.
+ // Avoid boiler-plate text on the master slide (0
+ // = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE):
+ //if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) {
+
+ for (HSLFTextParagraph htp : run) {
+ xhtml.startElement("p");
+
+ for (HSLFTextRun htr : htp.getTextRuns()) {
+ String line = htr.getRawText();
+ if (line != null) {
+ boolean isfirst = true;
+ for (String fragment : line.split("\\u000b")){
+ if (!isfirst) {
+ xhtml.startElement("br");
+ xhtml.endElement("br");
+ }
+ isfirst = false;
+ xhtml.characters(fragment.trim());
+ }
+ }
+ }
+ xhtml.endElement("p");
+
}
+
}
}
- private void handleSlideEmbeddedPictures(SlideShow slideshow, XHTMLContentHandler xhtml)
+ private void handleSlideEmbeddedPictures(HSLFSlideShow slideshow, XHTMLContentHandler xhtml)
throws TikaException, SAXException, IOException {
- for (PictureData pic : slideshow.getPictureData()) {
- String mediaType = null;
+ for (HSLFPictureData pic : slideshow.getPictureData()) {
+ String mediaType;
switch (pic.getType()) {
- case Picture.EMF:
+ case EMF:
mediaType = "application/x-emf";
break;
- case Picture.JPEG:
- mediaType = "image/jpeg";
- break;
- case Picture.PNG:
- mediaType = "image/png";
- break;
- case Picture.WMF:
+ case WMF:
mediaType = "application/x-msmetafile";
break;
- case Picture.DIB:
+ case DIB:
mediaType = "image/bmp";
break;
+ default:
+ mediaType = pic.getContentType();
+ break;
}
handleEmbeddedResource(
@@ -267,9 +277,9 @@ public class HSLFExtractor extends Abstr
}
}
- private void handleSlideEmbeddedResources(Slide slide, XHTMLContentHandler xhtml)
+ private void handleSlideEmbeddedResources(HSLFSlide slide, XHTMLContentHandler xhtml)
throws TikaException, SAXException, IOException {
- Shape[] shapes;
+ List<HSLFShape> shapes;
try {
shapes = slide.getShapes();
} catch (NullPointerException e) {
@@ -278,10 +288,10 @@ public class HSLFExtractor extends Abstr
return;
}
- for (Shape shape : shapes) {
+ for (HSLFShape shape : shapes) {
if (shape instanceof OLEShape) {
OLEShape oleShape = (OLEShape) shape;
- ObjectData data = null;
+ HSLFObjectData data = null;
try {
data = oleShape.getObjectData();
} catch (NullPointerException e) {
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java?rev=1706079&r1=1706078&r2=1706079&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java Wed Sep 30 16:51:19 2015
@@ -34,11 +34,14 @@ import org.apache.poi.xslf.usermodel.XML
import org.apache.poi.xslf.usermodel.XSLFComments;
import org.apache.poi.xslf.usermodel.XSLFGraphicFrame;
import org.apache.poi.xslf.usermodel.XSLFGroupShape;
+import org.apache.poi.xslf.usermodel.XSLFNotes;
+import org.apache.poi.xslf.usermodel.XSLFNotesMaster;
import org.apache.poi.xslf.usermodel.XSLFPictureShape;
import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.apache.poi.xslf.usermodel.XSLFShape;
import org.apache.poi.xslf.usermodel.XSLFSheet;
import org.apache.poi.xslf.usermodel.XSLFSlide;
+import org.apache.poi.xslf.usermodel.XSLFSlideLayout;
import org.apache.poi.xslf.usermodel.XSLFTable;
import org.apache.poi.xslf.usermodel.XSLFTableCell;
import org.apache.poi.xslf.usermodel.XSLFTableRow;
@@ -66,7 +69,7 @@ public class XSLFPowerPointExtractorDeco
protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException {
XMLSlideShow slideShow = (XMLSlideShow) extractor.getDocument();
- XSLFSlide[] slides = slideShow.getSlides();
+ List<XSLFSlide> slides = slideShow.getSlides();
for (XSLFSlide slide : slides) {
String slideDesc;
if (slide.getPackagePart() != null && slide.getPackagePart().getPartName() != null) {
@@ -80,7 +83,7 @@ public class XSLFPowerPointExtractorDeco
extractContent(slide.getShapes(), false, xhtml, slideDesc);
// slide layout which is the master sheet for this slide
- XSLFSheet slideLayout = slide.getMasterSheet();
+ XSLFSlideLayout slideLayout = slide.getMasterSheet();
extractContent(slideLayout.getShapes(), true, xhtml, null);
// slide master which is the master sheet for all text layouts
@@ -88,12 +91,12 @@ public class XSLFPowerPointExtractorDeco
extractContent(slideMaster.getShapes(), true, xhtml, null);
// notes (if present)
- XSLFSheet slideNotes = slide.getNotes();
+ XSLFNotes slideNotes = slide.getNotes();
if (slideNotes != null) {
extractContent(slideNotes.getShapes(), false, xhtml, slideDesc);
// master sheet for this notes
- XSLFSheet notesMaster = slideNotes.getMasterSheet();
+ XSLFNotesMaster notesMaster = slideNotes.getMasterSheet();
extractContent(notesMaster.getShapes(), true, xhtml, null);
}
@@ -108,7 +111,7 @@ public class XSLFPowerPointExtractorDeco
}
}
- private void extractContent(XSLFShape[] shapes, boolean skipPlaceholders, XHTMLContentHandler xhtml, String slideDesc)
+ private void extractContent(List<? extends XSLFShape> shapes, boolean skipPlaceholders, XHTMLContentHandler xhtml, String slideDesc)
throws SAXException {
for (XSLFShape sh : shapes) {
if (sh instanceof XSLFTextShape) {
@@ -126,7 +129,7 @@ public class XSLFPowerPointExtractorDeco
XSLFTable tbl = (XSLFTable) sh;
for (XSLFTableRow row : tbl) {
List<XSLFTableCell> cells = row.getCells();
- extractContent(cells.toArray(new XSLFTableCell[cells.size()]), skipPlaceholders, xhtml, slideDesc);
+ extractContent(cells, skipPlaceholders, xhtml, slideDesc);
}
} else if (sh instanceof XSLFGraphicFrame) {
XSLFGraphicFrame frame = (XSLFGraphicFrame) sh;
@@ -175,7 +178,7 @@ public class XSLFPowerPointExtractorDeco
*/
@Override
protected List<PackagePart> getMainDocumentParts() throws TikaException {
- List<PackagePart> parts = new ArrayList<PackagePart>();
+ List<PackagePart> parts = new ArrayList<>();
XMLSlideShow slideShow = (XMLSlideShow) extractor.getDocument();
XSLFSlideShow document = null;
try {
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=1706079&r1=1706078&r2=1706079&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java Wed Sep 30 16:51:19 2015
@@ -234,8 +234,9 @@ public class PowerPointParserTest extend
//autodate automatically. For pptx, where value is stored,
//value is extracted. For ppt, however, no date is extracted.
XMLResult result = getXML("testPPT_autodate.ppt");
+ System.out.println(result.xml);
assertContains(
- "<p class=\"slide-content\">Now<br />\n*<br />\n*<br />",
+ "<div class=\"slide-content\"><p>Now</p>",
result.xml);
}
}