You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/02/10 21:56:53 UTC

[tika] branch branch_1x updated: TIKA-3678 -- upgrade to POI 5.2.0

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new 0f17c4c  TIKA-3678 -- upgrade to POI 5.2.0
0f17c4c is described below

commit 0f17c4c5052d2eb878631e317591cfbc799fd9c4
Author: tallison <ta...@apache.org>
AuthorDate: Thu Feb 10 16:56:37 2022 -0500

    TIKA-3678 -- upgrade to POI 5.2.0
---
 CHANGES.txt                                        |   6 +
 tika-bundle/pom.xml                                |  45 +++++++-
 tika-eval/pom.xml                                  |   5 -
 tika-eval/src/main/resources/log4j2.xml            |   5 +
 tika-parent/pom.xml                                |   2 +-
 .../tika/parser/microsoft/OutlookExtractor.java    |   7 +-
 .../tika/parser/microsoft/SummaryExtractor.java    |   1 -
 .../microsoft/ooxml/OOXMLExtractorFactory.java     |  34 ++++--
 .../microsoft/ooxml/OOXMLTikaBodyPartHandler.java  |  16 ++-
 .../ooxml/OOXMLWordAndPowerPointTextHandler.java   | 124 +++++++++------------
 .../ooxml/SXSLFPowerPointExtractorDecorator.java   |   4 +-
 .../ooxml/XSLFPowerPointExtractorDecorator.java    |  19 +---
 .../ooxml/XSSFExcelExtractorDecorator.java         |   5 +-
 .../microsoft/ooxml/xps/XPSTextExtractor.java      |  29 ++++-
 .../xslf/XSLFEventBasedPowerPointExtractor.java    |  61 +++++-----
 .../ooxml/xwpf/XWPFEventBasedWordExtractor.java    | 118 +++++++++++---------
 .../apache/tika/server/resource/TikaResource.java  |   2 +-
 17 files changed, 280 insertions(+), 203 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 384011e..7cab47d 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,9 @@
+Release 1.28.2 - ???
+
+   * Upgrade to Apache POI 5.2.0. This is the first upgrade to POI
+      5.x and represents a major refactoring. Users will experience
+      significantly more logging (TIKA-3678).
+
 Release 1.28.1 - 2/8/2022
 
    * Security upgrades: xerces, log4j2, junrar and protobuf
diff --git a/tika-bundle/pom.xml b/tika-bundle/pom.xml
index 617d1f1..c850756 100644
--- a/tika-bundle/pom.xml
+++ b/tika-bundle/pom.xml
@@ -57,6 +57,11 @@
       <artifactId>tika-parsers</artifactId>
       <version>${project.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.logging.log4j</groupId>
+      <artifactId>log4j-api</artifactId>
+      <version>${log4j2.version}</version>
+    </dependency>
 
     <!-- Test dependencies -->
     <dependency>
@@ -201,9 +206,12 @@
               bcmail-jdk15on|
               bcprov-jdk15on|
               bcpkix-jdk15on|
-              poi|poi-scratchpad|
+              poi|
+              poi-scratchpad|
               poi-ooxml|
-              poi-ooxml-schemas|
+              poi-ooxml-lite|
+              log4j-core|
+              log4j-api|
               commons-math3|
               curvesapi|
               xmlbeans|
@@ -275,6 +283,13 @@
               com.dd.plist;resolution:=optional,
               com.adobe.xmp.properties;resolution:=optional,
               com.github.luben.zstd;resolution:=optional,
+              com.github.javaparser;resolution:=optional,
+              com.github.javaparser.ast;resolution:=optional,
+              com.github.javaparser.ast.body;resolution:=optional,
+              com.github.javaparser.ast.expr;resolution:=optional,
+              com.github.javaparser.ast.nodeTypes;resolution:=optional,
+              com.github.javaparser.ast.type;resolution:=optional,
+              com.github.javaparser.utils;resolution:=optional,
               com.github.openjson;resolution:=optional,
               com.github.jaiimageio.*;resolution:=optional,
               com.google.appengine.api.*;resolution:=optional,
@@ -287,11 +302,16 @@
               com.parso;resolution:=optional,
               com.sleepycat.je;resolution:=optional,
               com.sun.javadoc;resolution:=optional,
+              com.sun.org.apache.xalan.internal;resolution:=optional,
+              com.sun.org.apache.xml.internal.resolver;resolution:=optional,
+              com.sun.org.apache.xml.internal.resolver.tools;resolution:=optional,
+              com.sun.org.apache.xpath.internal.jaxp;resolution:=optional,
               com.sun.xml.bind.marshaller;resolution:=optional,
               com.sun.xml.internal.bind.marshaller;resolution:=optional,
               com.sun.msv.datatype;resolution:=optional,
               com.sun.msv.datatype.xsd;resolution:=optional,
               com.sun.tools.javadoc;resolution:=optional,
+              de.rototor.pdfbox.graphics2d;resolution:=optional,
               edu.mit.ll.mitie;resolution:=optional,
               edu.stanford.nlp.*;resolution:=optional,
               edu.wisc.ssec.mcidas;resolution:=optional,
@@ -321,19 +341,27 @@
               opendap.dap.parser;resolution:=optional,
               opennlp.maxent;resolution:=optional,
               opennlp.tools.namefind;resolution:=optional,
-	          opennlp.tools.authorage;resolution:=optional,
+	            opennlp.tools.authorage;resolution:=optional,
               net.didion.jwnl;resolution:=optional,
               net.sf.saxon;resolution:=optional,
               net.sf.saxon.dom;resolution:=optional,
+              net.sf.saxon.lib;resolution:=optional,
+              net.sf.saxon.ma.map;resolution:=optional,
               net.sf.saxon.om;resolution:=optional,
               net.sf.saxon.query;resolution:=optional,
               net.sf.saxon.sxpath;resolution:=optional,
+              net.sf.saxon.trans;resolution:=optional,
+              net.sf.saxon.tree.wrapper;resolution:=optional,
+              net.sf.saxon.type;resolution:=optional,
               net.sf.saxon.value;resolution:=optional,
               org.apache.batik.anim.dom;resolution:=optional,
               org.apache.batik.bridge;resolution:=optional,
+              org.apache.batik.dom;resolution:=optional,
               org.apache.batik.ext.awt;resolution:=optional,
               org.apache.batik.ext.awt.image.renderable;resolution:=optional,
               org.apache.batik.gvt;resolution:=optional,
+              org.apache.batik.parser;resolution:=optional,
+              org.apache.batik.svggen;resolution:=optional,
               org.apache.batik.util;resolution:=optional,
               org.apache.cxf.jaxrs.client;resolution:=optional,
               org.apache.cxf.jaxrs.ext.multipart;resolution:=optional,
@@ -350,6 +378,14 @@
               org.apache.commons.vfs2.util;resolution:=optional,
               org.apache.crimson.jaxp;resolution:=optional,
               org.apache.jcp.xml.dsig.internal.dom;resolution:=optional,
+              org.apache.logging.log4j;resolution:=optional,
+              org.apache.logging.log4j.util;resolution:=optional,
+              org.apache.logging.log4j.util.internal;resolution:=optional,
+              org.apache.logging.log4j.spi;resolution:=optional,
+              org.apache.maven.model;resolution:=optional,
+              org.apache.maven.plugin;resolution:=optional,
+              org.apache.maven.plugin.logging;resolution:=optional,
+              org.apache.maven.project;resolution:=optional,
               org.apache.pdfbox.debugger;resolution:=optional,
               org.apache.pdfbox.preflight.*;resolution:=optional,
               org.apache.sis;resolution:=optional,
@@ -449,6 +485,9 @@
               org.slf4j.helpers;resolution:=optional,
               org.sqlite;resolution:=optional,
               org.w3c.dom;resolution:=optional,
+              org.w3c.dom.ranges;resolution:=optional,
+              org.w3c.dom.svg;resolution:=optional,
+              org.w3c.dom.traversal;resolution:=optional,
               org.relaxng.datatype;resolution:=optional,
               org.xml.sax;resolution:=optional,
               org.xml.sax.ext;resolution:=optional,
diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml
index bd09d5f..9aef970 100644
--- a/tika-eval/pom.xml
+++ b/tika-eval/pom.xml
@@ -127,11 +127,6 @@
         </dependency>
         <dependency>
             <groupId>org.apache.poi</groupId>
-            <artifactId>poi-ooxml-schemas</artifactId>
-            <version>${poi.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.poi</groupId>
             <artifactId>poi-scratchpad</artifactId>
             <version>${poi.version}</version>
         </dependency>
diff --git a/tika-eval/src/main/resources/log4j2.xml b/tika-eval/src/main/resources/log4j2.xml
index c88e66e..a4bfda9 100644
--- a/tika-eval/src/main/resources/log4j2.xml
+++ b/tika-eval/src/main/resources/log4j2.xml
@@ -28,5 +28,10 @@
     <Root level="info">
       <AppenderRef ref="Console"/>
     </Root>
+    <!-- turn this off, we know richtext isn't supported -->
+    <Logger name="org.apache.poi.xssf.streaming.SXSSFCreationHelper" level="error">
+      <AppenderRef ref="Console"/>
+    </Logger>
+
   </Loggers>
 </Configuration>
\ No newline at end of file
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index ba5b5c3..30ab125 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -272,7 +272,7 @@
     <maven.shade.version>3.2.4</maven.shade.version>
     <rat.version>0.13</rat.version>
     <!-- NOTE: sync tukaani version with commons-compress in tika-parsers -->
-    <poi.version>4.1.2</poi.version>
+    <poi.version>5.2.0</poi.version>
     <commons.compress.version>1.21</commons.compress.version>
     <commons.io.version>2.11.0</commons.io.version>
     <commons.lang3.version>3.12.0</commons.lang3.version>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index c2e27d6..cf6f51d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -700,8 +700,11 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
 
         for (RecipientChunks chunks : recipientChunks) {
             Recipient r = new Recipient();
-            r.displayName = (chunks.recipientDisplayNameChunk != null) ? chunks.recipientDisplayNameChunk.toString() : null;
-            r.name = (chunks.recipientNameChunk != null) ? chunks.recipientNameChunk.toString() : null;
+            r.displayName = (chunks.getRecipientDisplayNameChunk() != null) ?
+                    chunks.getRecipientDisplayNameChunk().toString() : null;
+            r.name = (chunks.getRecipientNameChunk() != null) ?
+                    chunks.getRecipientNameChunk().toString()
+                    : null;
             r.emailAddress = chunks.getRecipientEmailAddress();
             List<PropertyValue> vals = chunks.getProperties().get(MAPIProperty.RECIPIENT_TYPE);
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
index ba98c0e..30c472d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
@@ -24,7 +24,6 @@ import java.util.Set;
 
 import org.apache.poi.hpsf.CustomProperties;
 import org.apache.poi.hpsf.DocumentSummaryInformation;
-import org.apache.poi.hpsf.MarkUnsupportedException;
 import org.apache.poi.hpsf.NoPropertySetStreamException;
 import org.apache.poi.hpsf.PropertySet;
 import org.apache.poi.hpsf.SummaryInformation;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 63dd79f..34aeae1 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -24,8 +24,9 @@ import java.util.Locale;
 
 import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
 import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.poi.extractor.ExtractorFactory;
 import org.apache.poi.ooxml.POIXMLDocument;
-import org.apache.poi.ooxml.extractor.ExtractorFactory;
+import org.apache.poi.ooxml.extractor.POIXMLExtractorFactory;
 import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
 import org.apache.poi.openxml4j.exceptions.InvalidOperationException;
@@ -35,7 +36,7 @@ import org.apache.poi.openxml4j.opc.PackageAccess;
 import org.apache.poi.openxml4j.opc.PackagePart;
 import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
 import org.apache.poi.util.LocaleUtil;
-import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xslf.extractor.XSLFExtractor;
 import org.apache.poi.xslf.usermodel.XMLSlideShow;
 import org.apache.poi.xslf.usermodel.XSLFRelation;
 import org.apache.poi.xslf.usermodel.XSLFSlideShow;
@@ -75,13 +76,24 @@ public class OOXMLExtractorFactory {
 
     private static final Logger LOG = LoggerFactory.getLogger(OOXMLExtractorFactory.class);
     private static final int MAX_BUFFER_LENGTH = 1000000;
+    private static POIXMLExtractorFactory EXTRACTOR_FACTORY = new POIXMLExtractorFactory();
+
+    //TODO find what happened to SUPPORTED_TYPES
+    private static XSLFRelation[] XSLF_RELATIONS = new XSLFRelation[] {
+            XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE,
+            XSLFRelation.PRESENTATIONML,
+            XSLFRelation.PRESENTATIONML_TEMPLATE, XSLFRelation.PRESENTATION_MACRO
+    };
+
+    static {
+        ExtractorFactory.setAllThreadsPreferEventExtractors(true);
+    }
 
     public static void parse(
             InputStream stream, ContentHandler baseHandler,
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
         Locale locale = context.get(Locale.class, LocaleUtil.getUserLocale());
-        ExtractorFactory.setThreadPrefersEventExtractors(true);
 
         //if there's a problem opening the zip file;
         //create a tmp file, and copy what you can read of it.
@@ -190,7 +202,7 @@ public class OOXMLExtractorFactory {
             }
 
             if (poiExtractor == null) {
-                poiExtractor = (POIXMLTextExtractor) ExtractorFactory.createExtractor(pkg);
+                poiExtractor = EXTRACTOR_FACTORY.create(pkg);
             }
 
             POIXMLDocument document = poiExtractor.getDocument();
@@ -215,8 +227,8 @@ public class OOXMLExtractorFactory {
                                 "The extractor returned was a " + poiExtractor
                 );
             } else if (document instanceof XMLSlideShow) {
-                extractor = new XSLFPowerPointExtractorDecorator(
-                        context, (org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) poiExtractor);
+                extractor = new XSLFPowerPointExtractorDecorator(metadata, context,
+                        (org.apache.poi.xslf.extractor.XSLFExtractor) poiExtractor);
             } else if (document instanceof XWPFDocument) {
                 extractor = new XWPFWordExtractorDecorator( metadata,
                         context, (XWPFWordExtractor) poiExtractor);
@@ -304,15 +316,13 @@ public class OOXMLExtractorFactory {
         }
         String targetContentType = corePart.getContentType();
 
-        XSLFRelation[] xslfRelations = org.apache.poi.xslf.extractor.XSLFPowerPointExtractor.SUPPORTED_TYPES;
-
-        for (int i = 0; i < xslfRelations.length; i++) {
-            XSLFRelation xslfRelation = xslfRelations[i];
+        for (int i = 0; i < XSLF_RELATIONS.length; i++) {
+            XSLFRelation xslfRelation = XSLF_RELATIONS[i];
             if (xslfRelation.getContentType().equals(targetContentType)) {
                 if (eventBased) {
                     return new XSLFEventBasedPowerPointExtractor(pkg);
                 } else {
-                    return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
+                    return new XSLFExtractor(new XMLSlideShow(pkg));
                 }
             }
         }
@@ -321,7 +331,7 @@ public class OOXMLExtractorFactory {
             if (eventBased) {
                 return new XSLFEventBasedPowerPointExtractor(pkg);
             } else {
-                return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
+                return new XSLFExtractor(new XMLSlideShow(pkg));
             }
         }
         return null;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
index 6778280..f336435 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
@@ -33,9 +33,9 @@ import org.apache.tika.sax.XHTMLContentHandler;
 public class OOXMLTikaBodyPartHandler
         implements OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler {
 
-    private final static String P = "p";
+    private static final String P = "p";
 
-    private final static char[] NEWLINE = new char[]{'\n'};
+    private static final char[] NEWLINE = new char[]{'\n'};
 
     private final XHTMLContentHandler xhtml;
     private final XWPFListManager listManager;
@@ -83,6 +83,7 @@ public class OOXMLTikaBodyPartHandler
 
     @Override
     public void run(RunProperties runProperties, String contents) throws SAXException {
+
         // True if we are currently in the named style tag:
         if (runProperties.isBold() != isBold) {
             if (isStrikeThrough) {
@@ -92,7 +93,6 @@ public class OOXMLTikaBodyPartHandler
             if (isUnderline) {
                 xhtml.endElement("u");
                 isUnderline = false;
-                ;
             }
             if (isItalics) {
                 xhtml.endElement("i");
@@ -214,6 +214,7 @@ public class OOXMLTikaBodyPartHandler
         } else if (tableCellDepth == 0) {
             xhtml.characters(NEWLINE, 0, 1);
         }
+
         if (tableCellDepth > 0) {
             pWithinCell++;
         }
@@ -222,14 +223,18 @@ public class OOXMLTikaBodyPartHandler
 
     @Override
     public void startTable() throws SAXException {
+
         xhtml.startElement("table");
         tableDepth++;
+
     }
 
     @Override
     public void endTable() throws SAXException {
+
         xhtml.endElement("table");
         tableDepth--;
+
     }
 
     @Override
@@ -278,7 +283,7 @@ public class OOXMLTikaBodyPartHandler
     }
 
     @Override
-    public boolean getIncludeDeletedText() {
+    public boolean isIncludeDeletedText() {
         return includeDeletedText;
     }
 
@@ -301,7 +306,7 @@ public class OOXMLTikaBodyPartHandler
     }
 
     @Override
-    public boolean getIncludeMoveFromText() {
+    public boolean isIncludeMoveFromText() {
         return includeMoveFromText;
     }
 
@@ -331,6 +336,7 @@ public class OOXMLTikaBodyPartHandler
         xhtml.startElement("img", attr);
         xhtml.endElement("img");
 
+
     }
 
     @Override
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
index 2cd4e31..77d0887 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
@@ -22,43 +22,36 @@ import java.util.Date;
 import java.util.Map;
 
 import org.apache.poi.xwpf.usermodel.UnderlinePatterns;
-import org.apache.tika.parser.microsoft.OfficeParserConfig;
-import org.apache.tika.utils.DateUtils;
 import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
+import org.apache.tika.utils.DateUtils;
+
 /**
  * This class is intended to handle anything that might contain IBodyElements:
  * main document, headers, footers, notes, slides, etc.
- *
+ * <p>
  * <p/>
- *
+ * <p>
  * This class does not generally check for namespaces, and it can be applied
  * to PPTX and DOCX for text extraction.
- *
+ * <p>
  * <p/>
  * This can be used to scrape content from charts.  It currently ignores
  * formula (&lt;c:f/&gt;) elements
- *
+ * <p>
  * <p/>
  * This does not work with .xlsx or .vsdx.
- *
+ * <p>
  * TODO: move this into POI?
- *
  */
 
 public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
 
 
-    public enum EditType {
-        NONE,
-        INSERT,
-        DELETE,
-        MOVE_TO,
-        MOVE_FROM
-    }
-
+    public final static String W_NS =
+            "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
     private final static String R = "r";
     private final static String FLD = "fld";
     private final static String RPR = "rPr";
@@ -92,21 +85,18 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
     private final static String RUBY = "ruby"; //phonetic section
     private final static String RT = "rt"; //phonetic run
     private static final String VAL = "val";
-
-
-    public final static String W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
-    private final static String MC_NS = "http://schemas.openxmlformats.org/markup-compatibility/2006";
+    private final static String MC_NS =
+            "http://schemas.openxmlformats.org/markup-compatibility/2006";
     private final static String O_NS = "urn:schemas-microsoft-com:office:office";
     private final static String PIC_NS = "http://schemas.openxmlformats.org/drawingml/2006/picture";
-    private final static String DRAWING_MAIN_NS = "http://schemas.openxmlformats.org/drawingml/2006/main";
+    private final static String DRAWING_MAIN_NS =
+            "http://schemas.openxmlformats.org/drawingml/2006/main";
     private final static String V_NS = "urn:schemas-microsoft-com:vml";
     private final static String C_NS = "http://schemas.openxmlformats.org/drawingml/2006/chart";
-
-    private final static String OFFICE_DOC_RELATIONSHIP_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
-
+    private final static String OFFICE_DOC_RELATIONSHIP_NS =
+            "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
     private final static char[] TAB_CHAR = new char[]{'\t'};
     private final static char NEWLINE = '\n';
-    
     private final static String BOOKMARK_START = "bookmarkStart";
     private final static String BOOKMARK_END = "bookmarkEnd";
     private final static String FOOTNOTE_REFERENCE = "footnoteReference";
@@ -117,24 +107,25 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
     private final static String MOVE_TO = "moveTo";
     private final static String ENDNOTE_REFERENCE = "endnoteReference";
     private static final String TEXTBOX = "textbox";
-
-
     private final XWPFBodyContentsHandler bodyContentsHandler;
-
     private final Map<String, String> linkedRelationships;
-
-    private boolean inR = false;//in run or in field. TODO: convert this to an integer because you can have a run within a run
+    private final RunProperties currRunProperties = new RunProperties();
+    private final ParagraphProperties currPProperties = new ParagraphProperties();
+    private final boolean includeTextBox;
+    private final boolean concatenatePhoneticRuns;
+    private final StringBuilder runBuffer = new StringBuilder();
+    private final StringBuilder rubyBuffer = new StringBuilder();
+    private boolean inR = false;
+    //inR: in run or in field. TODO: convert inR to an integer; a run can occur within a run
     private boolean inT = false;
     private boolean inRPr = false;
     private boolean inNumPr = false;
     private boolean inRt = false;
-
     private boolean inPic = false;
     private boolean inPict = false;
     private String picDescription = null;
     private String picRId = null;
     private String picFilename = null;
-
     //mechanism used to determine when to
     //signal the start of the p, and still
     //handle p with pPr and those without
@@ -143,28 +134,18 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
     //pPr can happen multiple times within a p
     //<p><pPr/><r><t>text</t></r><pPr></p>
     private boolean pStarted = false;
-
     //alternate content can be embedded in itself.
     //need to track depth.
     //if in alternate, choose fallback, maybe make this configurable?
     private int inACChoiceDepth = 0;
     private int inACFallbackDepth = 0;
-
-    private final RunProperties currRunProperties = new RunProperties();
-    private final ParagraphProperties currPProperties = new ParagraphProperties();
-    private final boolean includeTextBox;
-    private final boolean concatenatePhoneticRuns;
-    private final StringBuilder runBuffer = new StringBuilder();
-    private final StringBuilder rubyBuffer = new StringBuilder();//buffers rt in ruby sections (see 17.3.3.25)
-
-
     private boolean inDelText = false;
+    //note: rubyBuffer (declared above) buffers rt in ruby sections (see 17.3.3.25)
     private boolean inHlinkClick = false;
     private boolean inTextBox = false;
     private boolean inV = false; //in c:v in chart file
-
-    private OOXMLWordAndPowerPointTextHandler.EditType editType = OOXMLWordAndPowerPointTextHandler.EditType.NONE;
-
+    private OOXMLWordAndPowerPointTextHandler.EditType editType =
+            OOXMLWordAndPowerPointTextHandler.EditType.NONE;
     private DateUtils dateUtils = new DateUtils();
 
     public OOXMLWordAndPowerPointTextHandler(XWPFBodyContentsHandler bodyContentsHandler,
@@ -172,16 +153,15 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
         this(bodyContentsHandler, hyperlinks, true, true);
     }
 
-
     public OOXMLWordAndPowerPointTextHandler(XWPFBodyContentsHandler bodyContentsHandler,
-                                             Map<String, String> hyperlinks, boolean includeTextBox, boolean concatenatePhoneticRuns) {
+                                             Map<String, String> hyperlinks, boolean includeTextBox,
+                                             boolean concatenatePhoneticRuns) {
         this.bodyContentsHandler = bodyContentsHandler;
         this.linkedRelationships = hyperlinks;
         this.includeTextBox = includeTextBox;
         this.concatenatePhoneticRuns = concatenatePhoneticRuns;
     }
 
-
     @Override
     public void startDocument() throws SAXException {
     }
@@ -199,10 +179,11 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
     }
 
     @Override
-    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+    public void startElement(String uri, String localName, String qName, Attributes atts)
+            throws SAXException {
         //TODO: checkBox, textBox, sym, headerReference, footerReference, commentRangeEnd
 
-        if (lastStartElementWasP && ! PPR.equals(localName)) {
+        if (lastStartElementWasP && !PPR.equals(localName)) {
             bodyContentsHandler.startParagraph(currPProperties);
         }
 
@@ -220,7 +201,7 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
             return;
         }
 
-        if (! includeTextBox && localName.equals(TEXTBOX)) {
+        if (!includeTextBox && localName.equals(TEXTBOX)) {
             inTextBox = true;
             return;
         }
@@ -238,7 +219,7 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
         } else if (P.equals(localName)) {
             lastStartElementWasP = true;
         } else if (B.equals(localName)) { //TODO: add bCs
-            if(inR && inRPr) {
+            if (inR && inRPr) {
                 currRunProperties.setBold(true);
             }
         } else if (TC.equals(localName)) {
@@ -271,7 +252,7 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
             if (inNumPr) {
                 currPProperties.setNumId(getIntVal(atts));
             }
-        } else if(BR.equals(localName)) {
+        } else if (BR.equals(localName)) {
             runBuffer.append(NEWLINE);
         } else if (BOOKMARK_START.equals(localName)) {
             String name = atts.getValue(W_NS, "name");
@@ -301,7 +282,7 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
                 bodyContentsHandler.hyperlinkStart(hyperlink);
                 inHlinkClick = true;
             }
-        } else if(TBL.equals(localName)) {
+        } else if (TBL.equals(localName)) {
             bodyContentsHandler.startTable();
         } else if (BLIP.equals(localName)) { //check for DRAWING_NS
             picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "embed");
@@ -326,7 +307,7 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
             startEditedSection(EditType.MOVE_TO, atts);
         } else if (MOVE_FROM.equals(localName)) {
             startEditedSection(editType.MOVE_FROM, atts);
-        } else if (OLE_OBJECT.equals(localName)){ //check for O_NS?
+        } else if (OLE_OBJECT.equals(localName)) { //check for O_NS?
             String type = null;
             String refId = null;
             //TODO: clean this up and ...want to get ProgID?
@@ -335,14 +316,15 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
                 String attValue = atts.getValue(i);
                 if (attLocalName.equals("Type")) {
                     type = attValue;
-                } else if (OFFICE_DOC_RELATIONSHIP_NS.equals(atts.getURI(i)) && attLocalName.equals("id")) {
+                } else if (OFFICE_DOC_RELATIONSHIP_NS.equals(atts.getURI(i)) &&
+                        attLocalName.equals("id")) {
                     refId = attValue;
                 }
             }
             if ("Embed".equals(type)) {
                 bodyContentsHandler.embeddedOLERef(refId);
             }
-        } else if(CR.equals(localName)) {
+        } else if (CR.equals(localName)) {
             runBuffer.append(NEWLINE);
         } else if (ENDNOTE_REFERENCE.equals(localName)) {
             String id = atts.getValue(W_NS, "id");
@@ -386,7 +368,6 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
         return -1;
     }
 
-
     @Override
     public void endElement(String uri, String localName, String qName) throws SAXException {
 
@@ -399,7 +380,7 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
             return;
         }
 
-        if (! includeTextBox && localName.equals(TEXTBOX)) {
+        if (!includeTextBox && localName.equals(TEXTBOX)) {
             inTextBox = false;
             return;
         }
@@ -438,8 +419,8 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
             handleEndOfRun();
         } else if (DEL_TEXT.equals(localName)) {
             inDelText = false;
-        } else if (INS.equals(localName) || DEL.equals(localName) ||
-                MOVE_TO.equals(localName) || MOVE_FROM.equals(localName)) {
+        } else if (INS.equals(localName) || DEL.equals(localName) || MOVE_TO.equals(localName) ||
+                MOVE_FROM.equals(localName)) {
             editType = EditType.NONE;
         } else if (HYPERLINK.equals(localName)) {
             bodyContentsHandler.hyperlinkEnd();
@@ -447,7 +428,6 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
             handlePict();
         } else if (V.equals(localName) && C_NS.equals(uri)) { // in value in a chart
             inV = false;
-            appendToBuffer(TAB_CHAR, 0, 1);
             handleEndOfRun();
         } else if (RT.equals(localName)) {
             inRt = false;
@@ -492,22 +472,24 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
 
     @Override
     public void characters(char[] ch, int start, int length) throws SAXException {
+
         if (inACChoiceDepth > 0) {
             return;
-        } else if (! includeTextBox && inTextBox) {
+        } else if (!includeTextBox && inTextBox) {
             return;
         }
 
         if (editType.equals(EditType.MOVE_FROM) && inT) {
-            if (bodyContentsHandler.getIncludeMoveFromText()) {
+            if (bodyContentsHandler.isIncludeMoveFromText()) {
                 appendToBuffer(ch, start, length);
             }
         } else if (inT) {
             appendToBuffer(ch, start, length);
-        } else if (bodyContentsHandler.getIncludeDeletedText() && editType.equals(EditType.DELETE)) {
+        } else if (bodyContentsHandler.isIncludeDeletedText() && editType.equals(EditType.DELETE)) {
             appendToBuffer(ch, start, length);
         } else if (inV) {
             appendToBuffer(ch, start, length);
+            appendToBuffer(TAB_CHAR, 0, 1);
         }
     }
 
@@ -515,13 +497,13 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
     public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
         if (inACChoiceDepth > 0) {
             return;
-        } else if (! includeTextBox && inTextBox) {
+        } else if (!includeTextBox && inTextBox) {
             return;
         }
 
         if (inT) {
             appendToBuffer(ch, start, length);
-        } else if (bodyContentsHandler.getIncludeDeletedText() && inDelText) {
+        } else if (bodyContentsHandler.isIncludeDeletedText() && inDelText) {
             appendToBuffer(ch, start, length);
         }
     }
@@ -534,6 +516,10 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
         }
     }
 
+    public enum EditType {
+        NONE, INSERT, DELETE, MOVE_TO, MOVE_FROM
+    }
+
     public interface XWPFBodyContentsHandler {
 
         void run(RunProperties runProperties, String contents) throws SAXException;
@@ -569,13 +555,13 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
 
         void endEditedSection() throws SAXException;
 
-        boolean getIncludeDeletedText();
+        boolean isIncludeDeletedText() throws SAXException;
 
         void footnoteReference(String id) throws SAXException;
 
         void endnoteReference(String id) throws SAXException;
 
-        boolean getIncludeMoveFromText();
+        boolean isIncludeMoveFromText() throws SAXException;
 
         void embeddedOLERef(String refId) throws SAXException;
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
index ac6e278..5350f30 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
@@ -33,7 +33,7 @@ import org.apache.poi.openxml4j.opc.PackageRelationship;
 import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
 import org.apache.poi.openxml4j.opc.PackagingURIHelper;
 import org.apache.poi.openxml4j.opc.TargetMode;
-import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xslf.extractor.XSLFExtractor;
 import org.apache.poi.xslf.usermodel.XSLFRelation;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
@@ -95,7 +95,7 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
     }
 
     /**
-     * @see XSLFPowerPointExtractor#getText()
+     * @see XSLFExtractor#getText()
      */
     protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException {
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
index c63fcb3..6a496b4 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
@@ -30,13 +30,11 @@ import org.apache.poi.openxml4j.opc.PackageRelationship;
 import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
 import org.apache.poi.openxml4j.opc.PackagingURIHelper;
 import org.apache.poi.openxml4j.opc.TargetMode;
-import org.apache.poi.sl.extractor.SlideShowExtractor;
 import org.apache.poi.sl.usermodel.Placeholder;
-import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xslf.extractor.XSLFExtractor;
 import org.apache.poi.xslf.usermodel.XMLSlideShow;
 import org.apache.poi.xslf.usermodel.XSLFComment;
 import org.apache.poi.xslf.usermodel.XSLFCommentAuthors;
-import org.apache.poi.xslf.usermodel.XSLFComments;
 import org.apache.poi.xslf.usermodel.XSLFGraphicFrame;
 import org.apache.poi.xslf.usermodel.XSLFGroupShape;
 import org.apache.poi.xslf.usermodel.XSLFHyperlink;
@@ -73,23 +71,14 @@ public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
 
     private Metadata metadata;
 
-    public XSLFPowerPointExtractorDecorator(Metadata metadata, ParseContext context, XSLFPowerPointExtractor extractor) {
+    public XSLFPowerPointExtractorDecorator(Metadata metadata, ParseContext context,
+                                            XSLFExtractor extractor) {
         super(context, extractor);
         this.metadata = metadata;
     }
 
     /**
-     * use {@link XSLFPowerPointExtractorDecorator#XSLFPowerPointExtractorDecorator(Metadata, ParseContext, XSLFPowerPointExtractor)}
-     * @param context
-     * @param extractor
-     */
-    @Deprecated
-    public XSLFPowerPointExtractorDecorator(ParseContext context, XSLFPowerPointExtractor extractor) {
-        this(new Metadata(),context, extractor);
-    }
-
-    /**
-     * @see org.apache.poi.xslf.extractor.XSLFPowerPointExtractor#getText()
+     * @see XSLFExtractor#getText()
      */
     protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException {
         XMLSlideShow slideShow = (XMLSlideShow) extractor.getDocument();
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index 64ff7b1..f3b8de4 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -46,6 +46,7 @@ import org.apache.poi.xssf.eventusermodel.XSSFReader;
 import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
 import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
 import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
+import org.apache.poi.xssf.model.Comments;
 import org.apache.poi.xssf.model.CommentsTable;
 import org.apache.poi.xssf.model.StylesTable;
 import org.apache.poi.xssf.usermodel.XSSFComment;
@@ -161,7 +162,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
                 addDrawingHyperLinks(sheetPart);
                 sheetParts.add(sheetPart);
 
-                CommentsTable comments = iter.getSheetComments();
+                Comments comments = iter.getSheetComments();
 
                 // Start, and output the sheet name
                 xhtml.startElement("div");
@@ -346,7 +347,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
 
     public void processSheet(
             SheetContentsHandler sheetContentsExtractor,
-            CommentsTable comments,
+            Comments comments,
             StylesTable styles,
             ReadOnlySharedStringsTable strings,
             InputStream sheetInputStream)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java
index 0212920..297290b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSTextExtractor.java
@@ -18,6 +18,9 @@
 package org.apache.tika.parser.microsoft.ooxml.xps;
 
 
+import java.io.Closeable;
+import java.io.IOException;
+
 import org.apache.poi.ooxml.POIXMLDocument;
 import org.apache.poi.ooxml.POIXMLProperties;
 import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
@@ -25,20 +28,17 @@ import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
 import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.xmlbeans.XmlException;
 
-import java.io.IOException;
-
 /**
  * Currently, mostly a pass-through class to hold pkg and properties
  * and keep the general framework similar to our other POI-integrated
  * extractors.
  */
-public class XPSTextExtractor extends POIXMLTextExtractor {
+public class XPSTextExtractor implements POIXMLTextExtractor {
 
     private final OPCPackage pkg;
     private final POIXMLProperties properties;
 
     public XPSTextExtractor(OPCPackage pkg) throws OpenXML4JException, XmlException, IOException {
-        super((POIXMLDocument)null);
         this.pkg = pkg;
         this.properties = new POIXMLProperties(pkg);
 
@@ -53,6 +53,22 @@ public class XPSTextExtractor extends POIXMLTextExtractor {
     public String getText() {
         return null;
     }
+
+    @Override
+    public void setCloseFilesystem(boolean b) {
+
+    }
+
+    @Override
+    public boolean isCloseFilesystem() {
+        return false;
+    }
+
+    @Override
+    public Closeable getFilesystem() {
+        return null;
+    }
+
     public POIXMLProperties.CoreProperties getCoreProperties() {
         return this.properties.getCoreProperties();
     }
@@ -64,4 +80,9 @@ public class XPSTextExtractor extends POIXMLTextExtractor {
     public POIXMLProperties.CustomProperties getCustomProperties() {
         return this.properties.getCustomProperties();
     }
+
+    @Override
+    public POIXMLDocument getDocument() {
+        return null;
+    }
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
index bd5615d..46ada51 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
@@ -17,6 +17,7 @@
 
 package org.apache.tika.parser.microsoft.ooxml.xslf;
 
+import java.io.Closeable;
 import java.io.IOException;
 import java.util.Date;
 
@@ -25,41 +26,24 @@ import org.apache.poi.ooxml.POIXMLProperties;
 import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
 import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
 import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.xmlbeans.XmlException;
+
 import org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler;
 import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
 import org.apache.tika.parser.microsoft.ooxml.RunProperties;
-import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
-import org.apache.xmlbeans.XmlException;
 
-public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor {
+public class XSLFEventBasedPowerPointExtractor implements POIXMLTextExtractor {
 
 
     private OPCPackage container;
     private POIXMLProperties properties;
 
-    public XSLFEventBasedPowerPointExtractor(String path) throws XmlException, OpenXML4JException, IOException {
-        this(OPCPackage.open(path));
-    }
-
-    public XSLFEventBasedPowerPointExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
-        super((POIXMLDocument) null);
+    public XSLFEventBasedPowerPointExtractor(OPCPackage container)
+            throws XmlException, OpenXML4JException, IOException {
         this.container = container;
         this.properties = new POIXMLProperties(container);
     }
 
-
-    public static void main(String[] args) throws Exception {
-        if (args.length < 1) {
-            System.err.println("Use:");
-            System.err.println("  XSLFEventBasedPowerPointExtractor <filename.pptx>");
-            System.exit(1);
-        }
-
-        XWPFEventBasedWordExtractor extractor = new XWPFEventBasedWordExtractor(args[0]);
-        System.out.println(extractor.getText());
-        extractor.close();
-    }
-
     public OPCPackage getPackage() {
         return this.container;
     }
@@ -76,6 +60,11 @@ public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor {
         return this.properties.getCustomProperties();
     }
 
+    @Override
+    public POIXMLDocument getDocument() {
+        return null;
+    }
+
 
     @Override
     public String getText() {
@@ -83,9 +72,28 @@ public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor {
         return "";
     }
 
+    @Override
+    public void setCloseFilesystem(boolean b) {
+
+    }
+
+    @Override
+    public boolean isCloseFilesystem() {
+        return false;
+    }
+
+    @Override
+    public Closeable getFilesystem() {
+        return null;
+    }
 
+    @Override
+    public void close() throws IOException {
+        getPackage().revert();
+    }
 
-    private class XSLFToTextContentHandler implements OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler {
+    private static class XSLFToTextContentHandler
+            implements OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler {
         private final StringBuilder buffer;
 
         public XSLFToTextContentHandler(StringBuilder buffer) {
@@ -158,7 +166,8 @@ public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor {
         }
 
         @Override
-        public void startEditedSection(String editor, Date date, OOXMLWordAndPowerPointTextHandler.EditType editType) {
+        public void startEditedSection(String editor, Date date,
+                                       OOXMLWordAndPowerPointTextHandler.EditType editType) {
 
         }
 
@@ -168,7 +177,7 @@ public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor {
         }
 
         @Override
-        public boolean getIncludeDeletedText() {
+        public boolean isIncludeDeletedText() {
             return false;
         }
 
@@ -183,7 +192,7 @@ public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor {
         }
 
         @Override
-        public boolean getIncludeMoveFromText() {
+        public boolean isIncludeMoveFromText() {
             return false;
         }
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
index e39dfd4..fd8f764 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
@@ -17,7 +17,7 @@
 
 package org.apache.tika.parser.microsoft.ooxml.xwpf;
 
-import javax.xml.parsers.ParserConfigurationException;
+import java.io.Closeable;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Date;
@@ -29,7 +29,6 @@ import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.poi.ooxml.POIXMLDocument;
 import org.apache.poi.ooxml.POIXMLProperties;
 import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
-import org.apache.poi.ooxml.util.SAXHelper;
 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
 import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
 import org.apache.poi.openxml4j.opc.OPCPackage;
@@ -38,55 +37,39 @@ import org.apache.poi.openxml4j.opc.PackageRelationship;
 import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
 import org.apache.poi.xwpf.usermodel.XWPFNumbering;
 import org.apache.poi.xwpf.usermodel.XWPFRelation;
+import org.apache.xmlbeans.XmlException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.SAXException;
 
 import org.apache.tika.exception.RuntimeSAXException;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.exception.WriteLimitReachedException;
+import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler;
 import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
 import org.apache.tika.parser.microsoft.ooxml.RunProperties;
 import org.apache.tika.parser.microsoft.ooxml.XWPFListManager;
-import org.apache.xmlbeans.XmlException;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.xml.sax.InputSource;
-import org.xml.sax.SAXException;
-import org.xml.sax.XMLReader;
+import org.apache.tika.utils.XMLReaderUtils;
 
 //TODO: move this into POI?
+
 /**
  * Experimental class that is based on POI's XSSFEventBasedExcelExtractor
- *
  */
-public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
+public class XWPFEventBasedWordExtractor implements POIXMLTextExtractor {
 
     private static final Logger LOG = LoggerFactory.getLogger(XWPFEventBasedWordExtractor.class);
 
     private OPCPackage container;
     private POIXMLProperties properties;
 
-    public XWPFEventBasedWordExtractor(String path) throws XmlException, OpenXML4JException, IOException {
-        this(OPCPackage.open(path));
-    }
-
-    public XWPFEventBasedWordExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
-        super((POIXMLDocument) null);
+    public XWPFEventBasedWordExtractor(OPCPackage container)
+            throws XmlException, OpenXML4JException, IOException {
         this.container = container;
         this.properties = new POIXMLProperties(container);
     }
 
-
-    public static void main(String[] args) throws Exception {
-        if (args.length < 1) {
-            System.err.println("Use:");
-            System.err.println("  XWPFEventBasedWordExtractor <filename.xlsx>");
-            System.exit(1);
-        }
-
-        XWPFEventBasedWordExtractor extractor = new XWPFEventBasedWordExtractor(args[0]);
-        System.out.println(extractor.getText());
-        extractor.close();
-    }
-
     public OPCPackage getPackage() {
         return this.container;
     }
@@ -103,12 +86,18 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
         return this.properties.getCustomProperties();
     }
 
+    @Override
+    public POIXMLDocument getDocument() {
+        return null;
+    }
+
 
     @Override
     public String getText() {
         StringBuilder sb = new StringBuilder();
         //handle main document
-        List<PackagePart> pps = container.getPartsByContentType(XWPFRelation.DOCUMENT.getContentType());
+        List<PackagePart> pps =
+                container.getPartsByContentType(XWPFRelation.DOCUMENT.getContentType());
         if (pps != null) {
             for (PackagePart pp : pps) {
                 //likely only one, but why not...
@@ -122,6 +111,8 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
                     }
                     //swallow this because we don't actually call it
                     LOG.warn("SAXException handling document part", e);
+                } catch (TikaException e) {
+                    LOG.warn("ParseException handling document part", e);
                 }
             }
         }
@@ -141,6 +132,8 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
                     }
                     //swallow this because we don't actually call it
                     LOG.warn("SAXException handling glossary document part", e);
+                } catch (TikaException e) {
+                    LOG.warn("ParseException handling document part", e);
                 }
             }
         }
@@ -148,8 +141,24 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
         return sb.toString();
     }
 
+    @Override
+    public void setCloseFilesystem(boolean b) {
+
+    }
+
+    @Override
+    public boolean isCloseFilesystem() {
+        return false;
+    }
+
+    @Override
+    public Closeable getFilesystem() {
+        return null;
+    }
 
-    private void handleDocumentPart(PackagePart documentPart, StringBuilder sb) throws IOException, SAXException {
+
+    private void handleDocumentPart(PackagePart documentPart, StringBuilder sb)
+            throws IOException, SAXException, TikaException {
         //load the numbering/list manager and styles from the main document part
         XWPFNumbering numbering = loadNumbering(documentPart);
         XWPFListManager xwpfListManager = new XWPFListManager(numbering);
@@ -157,7 +166,8 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
 
         //headers
         try {
-            PackageRelationshipCollection headersPRC = documentPart.getRelationshipsByType(XWPFRelation.HEADER.getRelation());
+            PackageRelationshipCollection headersPRC =
+                    documentPart.getRelationshipsByType(XWPFRelation.HEADER.getRelation());
             if (headersPRC != null) {
                 for (int i = 0; i < headersPRC.size(); i++) {
                     PackagePart header = documentPart.getRelatedPart(headersPRC.getRelationship(i));
@@ -172,17 +182,15 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
         handlePart(documentPart, xwpfListManager, sb);
 
         //for now, just dump other components at end
-        for (XWPFRelation rel : new XWPFRelation[]{
-                XWPFRelation.FOOTNOTE,
-                XWPFRelation.COMMENT,
-                XWPFRelation.FOOTER,
-                XWPFRelation.ENDNOTE
-        }) {
+        for (XWPFRelation rel : new XWPFRelation[]{XWPFRelation.FOOTNOTE, XWPFRelation.COMMENT,
+                XWPFRelation.FOOTER, XWPFRelation.ENDNOTE}) {
             try {
-                PackageRelationshipCollection prc = documentPart.getRelationshipsByType(rel.getRelation());
+                PackageRelationshipCollection prc =
+                        documentPart.getRelationshipsByType(rel.getRelation());
                 if (prc != null) {
                     for (int i = 0; i < prc.size(); i++) {
-                        PackagePart packagePart = documentPart.getRelatedPart(prc.getRelationship(i));
+                        PackagePart packagePart =
+                                documentPart.getRelatedPart(prc.getRelationship(i));
                         handlePart(packagePart, xwpfListManager, sb);
                     }
                 }
@@ -192,18 +200,14 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
         }
     }
 
-    private void handlePart(PackagePart packagePart,
-                            XWPFListManager xwpfListManager, StringBuilder buffer) throws IOException, SAXException {
+    private void handlePart(PackagePart packagePart, XWPFListManager xwpfListManager,
+                            StringBuilder buffer) throws IOException, SAXException, TikaException {
 
         Map<String, String> hyperlinks = loadHyperlinkRelationships(packagePart);
         try (InputStream stream = packagePart.getInputStream()) {
-            XMLReader reader = SAXHelper.newXMLReader();
-            reader.setContentHandler(new OOXMLWordAndPowerPointTextHandler(
-                    new XWPFToTextContentHandler(buffer), hyperlinks));
-            reader.parse(new InputSource(new CloseShieldInputStream(stream)));
-
-        } catch (ParserConfigurationException e) {
-            LOG.warn("Can't configure XMLReader", e);
+            XMLReaderUtils.parseSAX(new CloseShieldInputStream(stream),
+                    new OOXMLWordAndPowerPointTextHandler(new XWPFToTextContentHandler(buffer),
+                            hyperlinks), new ParseContext());
         }
 
     }
@@ -211,7 +215,8 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
     private Map<String, String> loadHyperlinkRelationships(PackagePart bodyPart) {
         Map<String, String> hyperlinks = new HashMap<>();
         try {
-            PackageRelationshipCollection prc = bodyPart.getRelationshipsByType(XWPFRelation.HYPERLINK.getRelation());
+            PackageRelationshipCollection prc =
+                    bodyPart.getRelationshipsByType(XWPFRelation.HYPERLINK.getRelation());
             for (int i = 0; i < prc.size(); i++) {
                 PackageRelationship pr = prc.getRelationship(i);
                 if (pr == null) {
@@ -231,7 +236,8 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
 
     private XWPFNumbering loadNumbering(PackagePart packagePart) throws IOException {
         try {
-            PackageRelationshipCollection numberingParts = packagePart.getRelationshipsByType(XWPFRelation.NUMBERING.getRelation());
+            PackageRelationshipCollection numberingParts =
+                    packagePart.getRelationshipsByType(XWPFRelation.NUMBERING.getRelation());
             if (numberingParts.size() > 0) {
                 PackageRelationship numberingRelationShip = numberingParts.getRelationship(0);
                 if (numberingRelationShip == null) {
@@ -249,7 +255,8 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
         return null;
     }
 
-    private class XWPFToTextContentHandler implements OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler {
+    private static class XWPFToTextContentHandler
+            implements OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler {
         private final StringBuilder buffer;
 
         public XWPFToTextContentHandler(StringBuilder buffer) {
@@ -322,7 +329,8 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
         }
 
         @Override
-        public void startEditedSection(String editor, Date date, OOXMLWordAndPowerPointTextHandler.EditType editType) {
+        public void startEditedSection(String editor, Date date,
+                                       OOXMLWordAndPowerPointTextHandler.EditType editType) {
 
         }
 
@@ -332,7 +340,7 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
         }
 
         @Override
-        public boolean getIncludeDeletedText() {
+        public boolean isIncludeDeletedText() {
             return true;
         }
 
@@ -347,7 +355,7 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
         }
 
         @Override
-        public boolean getIncludeMoveFromText() {
+        public boolean isIncludeMoveFromText() {
             return false;
         }
 
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
index b326f7f..c77694d 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
@@ -22,7 +22,6 @@ import org.apache.commons.lang3.StringUtils;
 import org.apache.cxf.attachment.ContentDisposition;
 import org.apache.cxf.jaxrs.ext.multipart.Attachment;
 import org.apache.cxf.jaxrs.impl.MetadataMap;
-import org.apache.poi.ooxml.extractor.ExtractorFactory;
 import org.apache.tika.Tika;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.Detector;
@@ -55,6 +54,7 @@ import org.apache.tika.server.ServerStatus;
 import org.apache.tika.server.TikaServerParseException;
 import org.apache.tika.utils.ExceptionUtils;
 
+import org.apache.poi.extractor.ExtractorFactory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.xml.sax.ContentHandler;