You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/02/23 10:33:51 UTC
[tika] 01/01: TIKA-3164 -- WIP POI 5.0.0
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-3164-1.x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 7aa27326fcc285c52876a0ae3759d5fe93f62789
Author: tallison <ta...@apache.org>
AuthorDate: Tue Feb 23 05:33:26 2021 -0500
TIKA-3164 -- WIP POI 5.0.0
---
tika-parent/pom.xml | 2 +-
tika-parsers/pom.xml | 8 ++++++
.../tika/parser/microsoft/OutlookExtractor.java | 4 +--
.../microsoft/ooxml/OOXMLExtractorFactory.java | 30 ++++++++++++++--------
.../ooxml/SXSLFPowerPointExtractorDecorator.java | 4 +--
.../ooxml/XSLFPowerPointExtractorDecorator.java | 18 +++----------
.../microsoft/ooxml/xps/XPSTextExtractor.java | 25 ++++++++++++++++--
.../xslf/XSLFEventBasedPowerPointExtractor.java | 23 +++++++++++++++--
.../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 24 +++++++++++++++--
9 files changed, 101 insertions(+), 37 deletions(-)
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 626b1a2..baccf43 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -334,7 +334,7 @@
<maven.shade.version>3.2.4</maven.shade.version>
<rat.version>0.13</rat.version>
<!-- NOTE: sync tukaani version with commons-compress in tika-parsers -->
- <poi.version>4.1.2</poi.version>
+ <poi.version>5.0.0</poi.version>
<commons.compress.version>1.20</commons.compress.version>
<commons.io.version>2.8.0</commons.io.version>
<commons.lang3.version>3.11</commons.lang3.version>
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index fa5e6f0..53e0a26 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -264,6 +264,14 @@
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
</exclusion>
+ <exclusion>
+ <groupId>com.fasterxml.woodstox</groupId>
+ <artifactId>woodstox-core</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.xmlgraphics</groupId>
+ <artifactId>batik-all</artifactId>
+ </exclusion>
</exclusions>
</dependency>
<dependency>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index c2e27d6..b8c3d66 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -700,8 +700,8 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
for (RecipientChunks chunks : recipientChunks) {
Recipient r = new Recipient();
- r.displayName = (chunks.recipientDisplayNameChunk != null) ? chunks.recipientDisplayNameChunk.toString() : null;
- r.name = (chunks.recipientNameChunk != null) ? chunks.recipientNameChunk.toString() : null;
+ r.displayName = (chunks.getRecipientDisplayNameChunk() != null) ? chunks.getRecipientDisplayNameChunk().toString() : null;
+ r.name = (chunks.getRecipientNameChunk() != null) ? chunks.getRecipientNameChunk().toString() : null;
r.emailAddress = chunks.getRecipientEmailAddress();
List<PropertyValue> vals = chunks.getProperties().get(MAPIProperty.RECIPIENT_TYPE);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 15f2c33..e2dc17e 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -22,9 +22,8 @@ import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;
-import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.poi.ooxml.POIXMLDocument;
-import org.apache.poi.ooxml.extractor.ExtractorFactory;
+import org.apache.poi.ooxml.extractor.POIXMLExtractorFactory;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.InvalidOperationException;
@@ -34,10 +33,9 @@ import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.util.LocaleUtil;
-import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xslf.extractor.XSLFExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFRelation;
-import org.apache.poi.xslf.usermodel.XSLFSlideShow;
import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
@@ -64,6 +62,8 @@ import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import static org.apache.poi.ooxml.extractor.POIXMLExtractorFactory.setThreadPrefersEventExtractors;
+
/**
* Figures out the correct {@link OOXMLExtractor} for the supplied document and
* returns it.
@@ -72,13 +72,17 @@ public class OOXMLExtractorFactory {
private static final Logger LOG = LoggerFactory.getLogger(OOXMLExtractorFactory.class);
private static final int MAX_BUFFER_LENGTH = 1000000;
+ private static POIXMLExtractorFactory EXTRACTOR_FACTORY = new POIXMLExtractorFactory();
+
+ static {
+ setThreadPrefersEventExtractors(true);
+ }
public static void parse(
InputStream stream, ContentHandler baseHandler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
Locale locale = context.get(Locale.class, LocaleUtil.getUserLocale());
- ExtractorFactory.setThreadPrefersEventExtractors(true);
//if there's a problem opening the zip file;
//create a tmp file, and copy what you can read of it.
@@ -167,7 +171,7 @@ public class OOXMLExtractorFactory {
}
if (poiExtractor == null) {
- poiExtractor = (POIXMLTextExtractor) ExtractorFactory.createExtractor(pkg);
+ poiExtractor = EXTRACTOR_FACTORY.create(pkg);
}
POIXMLDocument document = poiExtractor.getDocument();
@@ -192,8 +196,8 @@ public class OOXMLExtractorFactory {
"The extractor returned was a " + poiExtractor
);
} else if (document instanceof XMLSlideShow) {
- extractor = new XSLFPowerPointExtractorDecorator(
- context, (org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) poiExtractor);
+ extractor = new XSLFPowerPointExtractorDecorator( metadata,
+ context, (org.apache.poi.xslf.extractor.XSLFExtractor) poiExtractor);
} else if (document instanceof XWPFDocument) {
extractor = new XWPFWordExtractorDecorator( metadata,
context, (XWPFWordExtractor) poiExtractor);
@@ -279,7 +283,11 @@ public class OOXMLExtractorFactory {
}
String targetContentType = corePart.getContentType();
- XSLFRelation[] xslfRelations = org.apache.poi.xslf.extractor.XSLFPowerPointExtractor.SUPPORTED_TYPES;
+ //TODO make this static...or find what happened to SUPPORTED_TYPES
+ XSLFRelation[] xslfRelations = new XSLFRelation[] {
+ XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE,
+ XSLFRelation.PRESENTATIONML_TEMPLATE
+ };
for (int i = 0; i < xslfRelations.length; i++) {
XSLFRelation xslfRelation = xslfRelations[i];
@@ -287,7 +295,7 @@ public class OOXMLExtractorFactory {
if (eventBased) {
return new XSLFEventBasedPowerPointExtractor(pkg);
} else {
- return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
+ return new XSLFExtractor(new XMLSlideShow(pkg));
}
}
}
@@ -296,7 +304,7 @@ public class OOXMLExtractorFactory {
if (eventBased) {
return new XSLFEventBasedPowerPointExtractor(pkg);
} else {
- return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
+ return new XSLFExtractor(new XMLSlideShow(pkg));
}
}
return null;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
index ac6e278..5350f30 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
@@ -33,7 +33,7 @@ import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.apache.poi.openxml4j.opc.TargetMode;
-import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xslf.extractor.XSLFExtractor;
import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
@@ -95,7 +95,7 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
}
/**
- * @see XSLFPowerPointExtractor#getText()
+ * @see XSLFExtractor#getText()
*/
protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
index c63fcb3..9b61d68 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
@@ -30,13 +30,11 @@ import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.apache.poi.openxml4j.opc.TargetMode;
-import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.sl.usermodel.Placeholder;
-import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xslf.extractor.XSLFExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFComment;
import org.apache.poi.xslf.usermodel.XSLFCommentAuthors;
-import org.apache.poi.xslf.usermodel.XSLFComments;
import org.apache.poi.xslf.usermodel.XSLFGraphicFrame;
import org.apache.poi.xslf.usermodel.XSLFGroupShape;
import org.apache.poi.xslf.usermodel.XSLFHyperlink;
@@ -73,23 +71,13 @@ public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
private Metadata metadata;
- public XSLFPowerPointExtractorDecorator(Metadata metadata, ParseContext context, XSLFPowerPointExtractor extractor) {
+ public XSLFPowerPointExtractorDecorator(Metadata metadata, ParseContext context, XSLFExtractor extractor) {
super(context, extractor);
this.metadata = metadata;
}
/**
- * use {@link XSLFPowerPointExtractorDecorator#XSLFPowerPointExtractorDecorator(Metadata, ParseContext, XSLFPowerPointExtractor)}
- * @param context
- * @param extractor
- */
- @Deprecated
- public XSLFPowerPointExtractorDecorator(ParseContext context, XSLFPowerPointExtractor extractor) {
- this(new Metadata(),context, extractor);
- }
-
- /**
- * @see org.apache.poi.xslf.extractor.XSLFPowerPointExtractor#getText()
+ * @see org.apache.poi.xslf.extractor.XSLFExtractor#getText()
*/
protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException {
XMLSlideShow slideShow = (XMLSlideShow) extractor.getDocument();
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java
index 0212920..a590d39 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java
@@ -25,6 +25,7 @@ import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.xmlbeans.XmlException;
+import java.io.Closeable;
import java.io.IOException;
/**
@@ -32,13 +33,12 @@ import java.io.IOException;
* and keep the general framework similar to our other POI-integrated
* extractors.
*/
-public class XPSTextExtractor extends POIXMLTextExtractor {
+public class XPSTextExtractor implements POIXMLTextExtractor {
private final OPCPackage pkg;
private final POIXMLProperties properties;
public XPSTextExtractor(OPCPackage pkg) throws OpenXML4JException, XmlException, IOException {
- super((POIXMLDocument)null);
this.pkg = pkg;
this.properties = new POIXMLProperties(pkg);
@@ -53,6 +53,22 @@ public class XPSTextExtractor extends POIXMLTextExtractor {
public String getText() {
return null;
}
+
+ @Override
+ public void setCloseFilesystem(boolean b) {
+
+ }
+
+ @Override
+ public boolean isCloseFilesystem() {
+ return false;
+ }
+
+ @Override
+ public Closeable getFilesystem() {
+ return null;
+ }
+
public POIXMLProperties.CoreProperties getCoreProperties() {
return this.properties.getCoreProperties();
}
@@ -64,4 +80,9 @@ public class XPSTextExtractor extends POIXMLTextExtractor {
public POIXMLProperties.CustomProperties getCustomProperties() {
return this.properties.getCustomProperties();
}
+
+ @Override
+ public POIXMLDocument getDocument() {
+ return null;
+ }
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
index bd5615d..4f666a7 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
@@ -17,6 +17,7 @@
package org.apache.tika.parser.microsoft.ooxml.xslf;
+import java.io.Closeable;
import java.io.IOException;
import java.util.Date;
@@ -31,7 +32,7 @@ import org.apache.tika.parser.microsoft.ooxml.RunProperties;
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
import org.apache.xmlbeans.XmlException;
-public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor {
+public class XSLFEventBasedPowerPointExtractor implements POIXMLTextExtractor {
private OPCPackage container;
@@ -42,7 +43,6 @@ public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor {
}
public XSLFEventBasedPowerPointExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
- super((POIXMLDocument) null);
this.container = container;
this.properties = new POIXMLProperties(container);
}
@@ -76,6 +76,11 @@ public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor {
return this.properties.getCustomProperties();
}
+ @Override
+ public POIXMLDocument getDocument() {
+ return null;
+ }
+
@Override
public String getText() {
@@ -83,6 +88,20 @@ public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor {
return "";
}
+ @Override
+ public void setCloseFilesystem(boolean b) {
+
+ }
+
+ @Override
+ public boolean isCloseFilesystem() {
+ return false;
+ }
+
+ @Override
+ public Closeable getFilesystem() {
+ return null;
+ }
private class XSLFToTextContentHandler implements OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
index 866bb78..d2ef6db 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
@@ -18,6 +18,7 @@
package org.apache.tika.parser.microsoft.ooxml.xwpf;
import javax.xml.parsers.ParserConfigurationException;
+import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.util.Date;
@@ -54,7 +55,7 @@ import org.xml.sax.XMLReader;
* Experimental class that is based on POI's XSSFEventBasedExcelExtractor
*
*/
-public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
+public class XWPFEventBasedWordExtractor implements POIXMLTextExtractor {
private static final Logger LOG = LoggerFactory.getLogger(XWPFEventBasedWordExtractor.class);
@@ -66,7 +67,6 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
}
public XWPFEventBasedWordExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
- super((POIXMLDocument) null);
this.container = container;
this.properties = new POIXMLProperties(container);
}
@@ -100,6 +100,11 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
return this.properties.getCustomProperties();
}
+ @Override
+ public POIXMLDocument getDocument() {
+ return null;
+ }
+
@Override
public String getText() {
@@ -139,6 +144,21 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
return sb.toString();
}
+ @Override
+ public void setCloseFilesystem(boolean b) {
+
+ }
+
+ @Override
+ public boolean isCloseFilesystem() {
+ return false;
+ }
+
+ @Override
+ public Closeable getFilesystem() {
+ return null;
+ }
+
private void handleDocumentPart(PackagePart documentPart, StringBuilder sb) throws IOException, SAXException {
//load the numbering/list manager and styles from the main document part