You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/11/08 20:14:54 UTC
[tika] 01/01: TIKA-3164 -- First attempt -- do not merge
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-3164
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 74972671eef2c20cdae21bcacdf3f6d19c7a8757
Author: tballison <ta...@apache.org>
AuthorDate: Mon Nov 8 15:14:45 2021 -0500
TIKA-3164 -- First attempt -- do not merge
---
tika-parent/pom.xml | 2 +-
.../tika/parser/microsoft/OutlookExtractor.java | 7 +++--
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 4 ---
.../parser/microsoft/ooxml/MetadataExtractor.java | 7 -----
.../microsoft/ooxml/OOXMLExtractorFactory.java | 33 +++++++++++++---------
.../ooxml/SXSLFPowerPointExtractorDecorator.java | 4 +--
.../ooxml/XSLFPowerPointExtractorDecorator.java | 18 ++----------
.../ooxml/XSSFExcelExtractorDecorator.java | 6 ++--
.../microsoft/ooxml/xps/XPSTextExtractor.java | 24 ++++++++++++++--
.../xslf/XSLFEventBasedPowerPointExtractor.java | 24 ++++++++++++++--
.../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 24 ++++++++++++++--
11 files changed, 99 insertions(+), 54 deletions(-)
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index ca04560..d0b673f 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -353,7 +353,7 @@
<pax.exam.version>4.11.0</pax.exam.version>
<pdfbox.version>2.0.24</pdfbox.version>
<!-- NOTE: sync tukaani version with commons-compress in tika-parsers -->
- <poi.version>4.1.2</poi.version>
+ <poi.version>5.1.0</poi.version>
<quartz.version>2.3.2</quartz.version>
<rome.version>1.16.0</rome.version>
<scm.version>1.12.0</scm.version>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 2ee27da..fd884de 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -662,9 +662,10 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
for (RecipientChunks chunks : recipientChunks) {
Recipient r = new Recipient();
- r.displayName = (chunks.recipientDisplayNameChunk != null) ?
- chunks.recipientDisplayNameChunk.toString() : null;
- r.name = (chunks.recipientNameChunk != null) ? chunks.recipientNameChunk.toString() :
+ r.displayName = (chunks.getRecipientDisplayNameChunk() != null) ?
+ chunks.getRecipientDisplayNameChunk().toString() : null;
+ r.name = (chunks.getRecipientNameChunk() != null) ?
+ chunks.getRecipientNameChunk().toString() :
null;
r.emailAddress = chunks.getRecipientEmailAddress();
List<PropertyValue> vals = chunks.getProperties().get(MAPIProperty.RECIPIENT_TYPE);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index f23ae12..15bb7a6 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -30,7 +30,6 @@ import java.util.Map;
import java.util.Set;
import org.apache.poi.ooxml.POIXMLDocument;
-import org.apache.poi.ooxml.extractor.ExtractorFactory;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.OPCPackage;
@@ -94,9 +93,6 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
private static final String TYPE_OLE_OBJECT =
"application/vnd.openxmlformats-officedocument.oleObject";
- static {
- ExtractorFactory.setAllThreadsPreferEventExtractors(true);
- }
private final EmbeddedDocumentExtractor embeddedExtractor;
private final ParseContext context;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
index 09252e9..97efe3e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
@@ -23,7 +23,6 @@ import java.util.Optional;
import org.apache.poi.ooxml.POIXMLProperties;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.opc.internal.PackagePropertiesPart;
-import org.apache.poi.openxml4j.util.Nullable;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.xmlbeans.impl.values.XmlValueOutOfRangeException;
import org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperty;
@@ -270,12 +269,6 @@ public class MetadataExtractor {
}
}
- private void setProperty(Metadata metadata, String name, Nullable<?> value) {
- if (value.getValue() != null) {
- setProperty(metadata, name, value.getValue().toString());
- }
- }
-
private void setProperty(Metadata metadata, Property property, String value) {
if (value != null) {
metadata.set(property, value);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 9f8db62..c15b003 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -25,7 +25,7 @@ import java.util.Locale;
import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.poi.ooxml.POIXMLDocument;
-import org.apache.poi.ooxml.extractor.ExtractorFactory;
+import org.apache.poi.ooxml.extractor.POIXMLExtractorFactory;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidOperationException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
@@ -34,10 +34,9 @@ import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.util.LocaleUtil;
-import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xslf.extractor.XSLFExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFRelation;
-import org.apache.poi.xslf.usermodel.XSLFSlideShow;
import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
@@ -74,11 +73,21 @@ public class OOXMLExtractorFactory {
private static final Logger LOG = LoggerFactory.getLogger(OOXMLExtractorFactory.class);
private static final int MAX_BUFFER_LENGTH = 1000000;
+ private static POIXMLExtractorFactory EXTRACTOR_FACTORY = new POIXMLExtractorFactory();
+
+ //TODO find what happened to SUPPORTED_TYPES
+ private static XSLFRelation[] XSLF_RELATIONS = new XSLFRelation[] {
+ XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE,
+ XSLFRelation.PRESENTATIONML,
+ XSLFRelation.PRESENTATIONML_TEMPLATE, XSLFRelation.PRESENTATION_MACRO
+ };
+ static {
+ POIXMLExtractorFactory.setAllThreadsPreferEventExtractors(true);
+ }
public static void parse(InputStream stream, ContentHandler baseHandler, Metadata metadata,
ParseContext context) throws IOException, SAXException, TikaException {
Locale locale = context.get(Locale.class, LocaleUtil.getUserLocale());
- ExtractorFactory.setThreadPrefersEventExtractors(true);
//if there's a problem opening the zip file;
//create a tmp file, and copy what you can read of it.
@@ -186,7 +195,7 @@ public class OOXMLExtractorFactory {
}
if (poiExtractor == null) {
- poiExtractor = (POIXMLTextExtractor) ExtractorFactory.createExtractor(pkg);
+ poiExtractor = EXTRACTOR_FACTORY.create(pkg);
}
POIXMLDocument document = poiExtractor.getDocument();
@@ -212,8 +221,8 @@ public class OOXMLExtractorFactory {
" found. " +
"The extractor returned was a " + poiExtractor);
} else if (document instanceof XMLSlideShow) {
- extractor = new XSLFPowerPointExtractorDecorator(context,
- (org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) poiExtractor);
+ extractor = new XSLFPowerPointExtractorDecorator(metadata, context,
+ (org.apache.poi.xslf.extractor.XSLFExtractor) poiExtractor);
} else if (document instanceof XWPFDocument) {
extractor = new XWPFWordExtractorDecorator(metadata, context,
(XWPFWordExtractor) poiExtractor);
@@ -302,15 +311,13 @@ public class OOXMLExtractorFactory {
}
String targetContentType = corePart.getContentType();
- XSLFRelation[] xslfRelations =
- org.apache.poi.xslf.extractor.XSLFPowerPointExtractor.SUPPORTED_TYPES;
-
- for (XSLFRelation xslfRelation : xslfRelations) {
+ for (int i = 0; i < XSLF_RELATIONS.length; i++) {
+ XSLFRelation xslfRelation = XSLF_RELATIONS[i];
if (xslfRelation.getContentType().equals(targetContentType)) {
if (eventBased) {
return new XSLFEventBasedPowerPointExtractor(pkg);
} else {
- return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
+ return new XSLFExtractor(new XMLSlideShow(pkg));
}
}
}
@@ -319,7 +326,7 @@ public class OOXMLExtractorFactory {
if (eventBased) {
return new XSLFEventBasedPowerPointExtractor(pkg);
} else {
- return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
+ return new XSLFExtractor(new XMLSlideShow(pkg));
}
}
return null;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
index 433804a..b24284a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
@@ -33,7 +33,7 @@ import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.apache.poi.openxml4j.opc.TargetMode;
-import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xslf.extractor.XSLFExtractor;
import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
@@ -96,7 +96,7 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
}
/**
- * @see XSLFPowerPointExtractor#getText()
+ * @see XSLFExtractor#getText()
*/
protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
index 7994046..8501307 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
@@ -31,7 +31,7 @@ import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.apache.poi.openxml4j.opc.TargetMode;
import org.apache.poi.sl.usermodel.Placeholder;
-import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xslf.extractor.XSLFExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFComment;
import org.apache.poi.xslf.usermodel.XSLFCommentAuthors;
@@ -74,26 +74,14 @@ public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
private Metadata metadata;
public XSLFPowerPointExtractorDecorator(Metadata metadata, ParseContext context,
- XSLFPowerPointExtractor extractor) {
+ XSLFExtractor extractor) {
super(context, extractor);
this.metadata = metadata;
}
- /**
- * use {@link XSLFPowerPointExtractorDecorator#XSLFPowerPointExtractorDecorator(Metadata,
- * ParseContext, XSLFPowerPointExtractor)}
- *
- * @param context
- * @param extractor
- */
- @Deprecated
- public XSLFPowerPointExtractorDecorator(ParseContext context,
- XSLFPowerPointExtractor extractor) {
- this(new Metadata(), context, extractor);
- }
/**
- * @see org.apache.poi.xslf.extractor.XSLFPowerPointExtractor#getText()
+ * @see org.apache.poi.xslf.extractor.XSLFExtractor#getText()
*/
protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException {
XMLSlideShow slideShow = (XMLSlideShow) extractor.getDocument();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index 9d4949a..3c1b107 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -46,7 +46,7 @@ import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
-import org.apache.poi.xssf.model.CommentsTable;
+import org.apache.poi.xssf.model.Comments;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFComment;
import org.apache.poi.xssf.usermodel.XSSFDrawing;
@@ -159,7 +159,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
addDrawingHyperLinks(sheetPart);
sheetParts.add(sheetPart);
- CommentsTable comments = iter.getSheetComments();
+ Comments comments = iter.getSheetComments();
// Start, and output the sheet name
xhtml.startElement("div");
@@ -344,7 +344,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
}
- public void processSheet(SheetContentsHandler sheetContentsExtractor, CommentsTable comments,
+ public void processSheet(SheetContentsHandler sheetContentsExtractor, Comments comments,
StylesTable styles, ReadOnlySharedStringsTable strings,
InputStream sheetInputStream) throws IOException, SAXException {
try {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java
index f49e6de..297290b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java
@@ -18,6 +18,7 @@
package org.apache.tika.parser.microsoft.ooxml.xps;
+import java.io.Closeable;
import java.io.IOException;
import org.apache.poi.ooxml.POIXMLDocument;
@@ -32,13 +33,12 @@ import org.apache.xmlbeans.XmlException;
* and keep the general framework similar to our other POI-integrated
* extractors.
*/
-public class XPSTextExtractor extends POIXMLTextExtractor {
+public class XPSTextExtractor implements POIXMLTextExtractor {
private final OPCPackage pkg;
private final POIXMLProperties properties;
public XPSTextExtractor(OPCPackage pkg) throws OpenXML4JException, XmlException, IOException {
- super((POIXMLDocument) null);
this.pkg = pkg;
this.properties = new POIXMLProperties(pkg);
@@ -54,6 +54,21 @@ public class XPSTextExtractor extends POIXMLTextExtractor {
return null;
}
+ @Override
+ public void setCloseFilesystem(boolean b) {
+
+ }
+
+ @Override
+ public boolean isCloseFilesystem() {
+ return false;
+ }
+
+ @Override
+ public Closeable getFilesystem() {
+ return null;
+ }
+
public POIXMLProperties.CoreProperties getCoreProperties() {
return this.properties.getCoreProperties();
}
@@ -65,4 +80,9 @@ public class XPSTextExtractor extends POIXMLTextExtractor {
public POIXMLProperties.CustomProperties getCustomProperties() {
return this.properties.getCustomProperties();
}
+
+ @Override
+ public POIXMLDocument getDocument() {
+ return null;
+ }
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
index ff0fd9f..28b9845 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
@@ -17,6 +17,7 @@
package org.apache.tika.parser.microsoft.ooxml.xslf;
+import java.io.Closeable;
import java.io.IOException;
import java.util.Date;
@@ -33,7 +34,7 @@ import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
import org.apache.tika.parser.microsoft.ooxml.RunProperties;
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
-public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor {
+public class XSLFEventBasedPowerPointExtractor implements POIXMLTextExtractor {
private OPCPackage container;
@@ -46,7 +47,6 @@ public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor {
public XSLFEventBasedPowerPointExtractor(OPCPackage container)
throws XmlException, OpenXML4JException, IOException {
- super((POIXMLDocument) null);
this.container = container;
this.properties = new POIXMLProperties(container);
}
@@ -80,6 +80,11 @@ public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor {
return this.properties.getCustomProperties();
}
+ @Override
+ public POIXMLDocument getDocument() {
+ return null;
+ }
+
@Override
public String getText() {
@@ -87,6 +92,21 @@ public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor {
return "";
}
+ @Override
+ public void setCloseFilesystem(boolean b) {
+
+ }
+
+ @Override
+ public boolean isCloseFilesystem() {
+ return false;
+ }
+
+ @Override
+ public Closeable getFilesystem() {
+ return null;
+ }
+
private static class XSLFToTextContentHandler
implements OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
index 9901eb9..5b87599 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
@@ -17,6 +17,7 @@
package org.apache.tika.parser.microsoft.ooxml.xwpf;
+import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.util.Date;
@@ -58,7 +59,7 @@ import org.apache.tika.parser.microsoft.ooxml.XWPFListManager;
/**
* Experimental class that is based on POI's XSSFEventBasedExcelExtractor
*/
-public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
+public class XWPFEventBasedWordExtractor implements POIXMLTextExtractor {
private static final Logger LOG = LoggerFactory.getLogger(XWPFEventBasedWordExtractor.class);
@@ -72,7 +73,6 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
public XWPFEventBasedWordExtractor(OPCPackage container)
throws XmlException, OpenXML4JException, IOException {
- super((POIXMLDocument) null);
this.container = container;
this.properties = new POIXMLProperties(container);
}
@@ -106,6 +106,11 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
return this.properties.getCustomProperties();
}
+ @Override
+ public POIXMLDocument getDocument() {
+ return null;
+ }
+
@Override
public String getText() {
@@ -152,6 +157,21 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
return sb.toString();
}
+ @Override
+ public void setCloseFilesystem(boolean b) {
+
+ }
+
+ @Override
+ public boolean isCloseFilesystem() {
+ return false;
+ }
+
+ @Override
+ public Closeable getFilesystem() {
+ return null;
+ }
+
private void handleDocumentPart(PackagePart documentPart, StringBuilder sb)
throws IOException, SAXException {