You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/20 18:16:32 UTC

[2/2] tika git commit: TIKA-2220 - refactor new sax pptx and docx to reduce code duplication.

TIKA-2220 - refactor new sax pptx and docx to reduce code duplication.


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/376318fc
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/376318fc
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/376318fc

Branch: refs/heads/master
Commit: 376318fc1b34014ec31d5fbfdfa962183ea8c717
Parents: ca37313
Author: tballison <ta...@mitre.org>
Authored: Tue Dec 20 13:16:20 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Tue Dec 20 13:16:20 2016 -0500

----------------------------------------------------------------------
 .../ooxml/AbstractDocumentXMLBodyHandler.java   |  99 ----
 .../ooxml/OOXMLTikaBodyPartHandler.java         | 397 +++++++++++++++
 .../OOXMLWordAndPowerPointTextHandler.java      | 497 +++++++++++++++++++
 .../SXSLFPowerPointExtractorDecorator.java      |  30 +-
 .../ooxml/SXWPFWordExtractorDecorator.java      |   6 +-
 .../parser/microsoft/ooxml/XWPFListManager.java |  23 +
 .../ooxml/xslf/XSLFDocumentXMLBodyHandler.java  | 330 ------------
 .../xslf/XSLFEventBasedPowerPointExtractor.java |  53 +-
 .../ooxml/xslf/XSLFTikaBodyPartHandler.java     | 262 ----------
 .../ooxml/xwpf/XWPFDocumentXMLBodyHandler.java  | 388 ---------------
 .../ooxml/xwpf/XWPFEventBasedWordExtractor.java |   7 +-
 .../microsoft/ooxml/xwpf/XWPFStylesShim.java    |  27 +-
 .../ooxml/xwpf/XWPFTikaBodyPartHandler.java     | 376 --------------
 .../ooxml/xwpf/ml2006/BodyPartHandler.java      |  64 ---
 .../ooxml/xwpf/ml2006/Word2006MLDocHandler.java |  16 +-
 .../WordAndPowerPointTextPartHandler.java       |  64 +++
 .../microsoft/ooxml/SXWPFExtractorTest.java     |   2 +-
 17 files changed, 1084 insertions(+), 1557 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/376318fc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractDocumentXMLBodyHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractDocumentXMLBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractDocumentXMLBodyHandler.java
deleted file mode 100644
index 5037fd2..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractDocumentXMLBodyHandler.java
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml;
-
-
-import org.xml.sax.helpers.DefaultHandler;
-
-public class AbstractDocumentXMLBodyHandler extends DefaultHandler {
-
-    protected final static String R = "r";
-    protected final static String FLD = "fld";
-    protected final static String RPR = "rPr";
-    protected final static String P = "p";
-    protected static String P_STYLE = "pStyle";
-    protected final static String PPR = "pPr";
-    protected static String T = "t";
-    protected final static String TAB = "tab";
-    protected final static String B = "b";
-    protected final static String ILVL = "ilvl";
-    protected final static String NUM_ID = "numId";
-    protected final static String TC = "tc";
-    protected final static String TR = "tr";
-    protected final static String I = "i";
-    protected final static String NUM_PR = "numPr";
-    protected final static String BR = "br";
-    protected final static String HYPERLINK = "hyperlink";
-    protected final static String TBL = "tbl";
-    protected final static String PIC = "pic";
-    protected final static String PICT = "pict";
-    protected final static String IMAGEDATA = "imagedata";
-    protected final static String BLIP = "blip";
-    protected final static String CHOICE = "Choice";
-    protected final static String FALLBACK = "Fallback";
-    protected final static String OLE_OBJECT = "OLEObject";
-    protected final static String CR = "cr";
-
-    public final static String W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
-    protected final static String MC_NS = "http://schemas.openxmlformats.org/markup-compatibility/2006";
-    protected final static String O_NS = "urn:schemas-microsoft-com:office:office";
-    protected final static String PIC_NS = "http://schemas.openxmlformats.org/drawingml/2006/picture";
-    protected final static String DRAWING_MAIN_NS = "http://schemas.openxmlformats.org/drawingml/2006/main";
-    protected final static String V_NS = "urn:schemas-microsoft-com:vml";
-
-    protected final static String OFFICE_DOC_RELATIONSHIP_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
-
-    protected final static char[] TAB_CHAR = new char[1];
-    protected final static char NEWLINE = '\n';
-
-    static {
-        TAB_CHAR[0] = '\t';
-    }
-
-    protected boolean inR = false;//in run or in field
-    protected boolean inT = false;
-    protected boolean inRPr = false;
-    protected boolean inNumPr = false;
-
-    protected boolean inPic = false;
-    boolean inPict = false;
-    protected String picDescription = null;
-    protected String picRId = null;
-    String picFilename = null;
-
-    //mechanism used to determine when to
-    //signal the start of the p, and still
-    //handle p with pPr and those without
-    protected boolean lastStartElementWasP = false;
-    //have we signaled the start of a p?
-    //pPr can happen multiple times within a p
-    //<p><pPr/><r><t>text</t></r><pPr></p>
-    protected boolean pStarted = false;
-
-    //alternate content can be embedded in itself.
-    //need to track depth.
-    //if in alternate, choose fallback, maybe make this configurable?
-    protected int inACChoiceDepth = 0;
-    protected int inACFallbackDepth = 0;
-
-    protected RunProperties currRunProperties = new RunProperties();
-    protected ParagraphProperties currPProperties = new ParagraphProperties();
-
-    protected final StringBuilder runBuffer = new StringBuilder();
-
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/376318fc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
new file mode 100644
index 0000000..ef3b3dc
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
@@ -0,0 +1,397 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml;
+
+
+import java.math.BigInteger;
+import java.util.Date;
+
+import org.apache.tika.parser.microsoft.OfficeParserConfig;
+import org.apache.tika.parser.microsoft.WordExtractor;
+import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFStylesShim;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+public class OOXMLTikaBodyPartHandler implements OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler {
+    //This class turns body-content callbacks into XHTML events (p, table/tr/td, a, b, i, img, div) on the wrapped handler.
+    private final static String P = "p";
+
+    private final static char[] NEWLINE = new char[]{'\n'};
+
+    private final XHTMLContentHandler xhtml;
+    private final XWPFListManager listManager;
+    private final boolean includeDeletedText;
+    private final boolean includeMoveFromText;
+    private final XWPFStylesShim styles;
+
+    private int pDepth = 0; //paragraph depth
+    private int tableDepth = 0;//table depth
+    private int sdtDepth = 0;//sdt depth: raised in startSDT(), lowered in endSDT()
+    private boolean isItalics = false; //true while an <i> element is open
+    private boolean isBold = false; //true while a <b> element is open
+    private boolean wroteHyperlinkStart = false; //true between writing <a href> and hyperlinkEnd()
+
+    //TODO: fix this
+    //pWithinCell should be an array/stack of given cell depths
+    //so that when you get to the end of an embedded table, e.g.,
+    //you know what your paragraph count was in the parent cell.
+    //<tc><p/><p/><table><tr><tc><p/><p/></tc></tr></table>...
+    private int tableCellDepth = 0;
+    private int pWithinCell = 0;
+
+    //will need to replace this with a stack
+    //if we're marking more than the first level <p/> element
+    private String paragraphTag = null;
+
+    public OOXMLTikaBodyPartHandler(XHTMLContentHandler xhtml) {
+        this.xhtml = xhtml;
+        this.styles = XWPFStylesShim.EMPTY_STYLES;
+        this.listManager = XWPFListManager.EMPTY_LIST;
+        this.includeDeletedText = false;
+        this.includeMoveFromText = false;
+    }
+
+    public OOXMLTikaBodyPartHandler(XHTMLContentHandler xhtml, XWPFStylesShim styles, XWPFListManager listManager, OfficeParserConfig parserConfig) {
+        this.xhtml = xhtml;
+        this.styles = styles;
+        this.listManager = listManager;
+        this.includeDeletedText = parserConfig.getIncludeDeletedContent(); //parserConfig is dereferenced: it must be non-null
+        this.includeMoveFromText = parserConfig.getIncludeMoveFromContent();
+    }
+
+    @Override
+    public void run(RunProperties runProperties, String contents) {
+        try {
+            //bold changed: close any open <i> first so that <i> always nests inside <b>
+            if (runProperties.getBold() != isBold) {
+                if (isItalics) {
+                    xhtml.endElement("i");
+                    isItalics = false;
+                }
+                if (runProperties.getBold()) {
+                    xhtml.startElement("b");
+                    isBold = true;
+                } else {
+                    xhtml.endElement("b");
+                    isBold = false;
+                }
+            }
+
+            if (runProperties.getItalics() != isItalics) {
+                if (runProperties.getItalics()) {
+                    xhtml.startElement("i");
+                    isItalics = true;
+                } else {
+                    xhtml.endElement("i");
+                    isItalics = false;
+                }
+            }
+
+            xhtml.characters(contents);
+
+        } catch (SAXException e) {
+            //NOTE(review): SAXException is silently dropped here and in the other callbacks; consider logging
+        }
+    }
+
+    @Override
+    public void hyperlinkStart(String link) {
+        try {
+            if (link != null) {
+                xhtml.startElement("a", "href", link);
+                wroteHyperlinkStart = true;
+            }
+        } catch (SAXException e) {
+            //swallow
+        }
+    }
+
+    @Override
+    public void hyperlinkEnd() {
+        try {
+            if (wroteHyperlinkStart) {
+                closeStyleTags();
+                wroteHyperlinkStart = false;
+                xhtml.endElement("a");
+            }
+        } catch (SAXException e) {
+            //swallow
+        }
+    }
+
+    @Override
+    public void startParagraph(ParagraphProperties paragraphProperties) {
+        if (pDepth == 0 && tableDepth == 0 && sdtDepth == 0) {
+            paragraphTag = P;
+            String styleClass = null;
+            //TIKA-2144 check that styles is not null
+            if (paragraphProperties.getStyleID() != null && styles != null) {
+                String styleName = styles.getStyleName(
+                        paragraphProperties.getStyleID()
+                );
+                if (styleName != null) {
+                    WordExtractor.TagAndStyle tas = WordExtractor.buildParagraphTagAndStyle(
+                            styleName, false);
+                    paragraphTag = tas.getTag();
+                    styleClass = tas.getStyleClass();
+                }
+            }
+
+            //open the element, carrying the style class when one was resolved
+            try {
+                if (styleClass == null) {
+                    xhtml.startElement(paragraphTag);
+                } else {
+                    xhtml.startElement(paragraphTag, "class", styleClass);
+                }
+            } catch (SAXException e) {
+                //swallow
+            }
+        }
+        //the list number, if any, is written even when no element was opened above
+        try {
+            writeParagraphNumber(paragraphProperties.getNumId(),
+                    paragraphProperties.getIlvl(), listManager, xhtml);
+        } catch (SAXException e) {
+            //swallow
+        }
+        pDepth++;
+    }
+
+    //NOTE(review): startParagraph() also requires sdtDepth == 0 before opening a tag; confirm the close below stays balanced inside an sdt.
+    @Override
+    public void endParagraph() {
+        try {
+            closeStyleTags();
+            if (pDepth == 1 && tableDepth == 0) {
+                xhtml.endElement(paragraphTag);
+            } else if (tableCellDepth > 0 && pWithinCell > 0){
+                xhtml.characters(NEWLINE, 0, 1);
+            } else if (tableCellDepth == 0) {
+                xhtml.characters(NEWLINE, 0, 1);
+            }
+        } catch (SAXException e) {
+            //swallow
+        }
+        if (tableCellDepth > 0) {
+            pWithinCell++;
+        }
+        pDepth--;
+    }
+
+    @Override
+    public void startTable() {
+        try {
+            xhtml.startElement("table");
+            tableDepth++;
+        } catch (SAXException e) {
+            //swallow; note that tableDepth is not incremented when startElement throws
+        }
+    }
+
+    @Override
+    public void endTable() {
+        try {
+            xhtml.endElement("table");
+            tableDepth--;
+        } catch (SAXException e) {
+            //swallow; note that tableDepth is not decremented when endElement throws
+        }
+    }
+
+    @Override
+    public void startTableRow() {
+        try {
+            xhtml.startElement("tr");
+        } catch (SAXException e) {
+            //swallow
+        }
+    }
+
+    @Override
+    public void endTableRow() {
+        try {
+            xhtml.endElement("tr");
+        } catch (SAXException e) {
+            //swallow
+        }
+    }
+
+    @Override
+    public void startTableCell() {
+        try {
+            xhtml.startElement("td");
+        } catch (SAXException e) {
+            //swallow
+        }
+        tableCellDepth++;
+    }
+
+    @Override
+    public void endTableCell() {
+        try {
+            xhtml.endElement("td");
+        } catch (SAXException e) {
+            //swallow
+        }
+        pWithinCell = 0;
+        tableCellDepth--;
+    }
+
+    @Override
+    public void startSDT() {
+        try {
+            closeStyleTags();
+            sdtDepth++;
+        } catch (SAXException e) {
+            //swallow; note that sdtDepth is not incremented when closeStyleTags throws
+        }
+    }
+
+    @Override
+    public void endSDT() {
+        sdtDepth--;
+    }
+
+    @Override
+    public void startEditedSection(String editor, Date date, OOXMLWordAndPowerPointTextHandler.EditType editType) {
+        //no-op
+    }
+
+    @Override
+    public void endEditedSection() {
+        //no-op
+    }
+
+    @Override
+    public boolean getIncludeDeletedText() {
+        return includeDeletedText;
+    }
+
+    @Override
+    public void footnoteReference(String id) {
+        if (id != null) {
+            try {
+                xhtml.characters("[");
+                xhtml.characters(id);
+                xhtml.characters("]");
+            } catch (SAXException e) {
+                //swallow
+            }
+        }
+    }
+
+    @Override
+    public void endnoteReference(String id) {
+        if (id != null) {
+            try {
+                xhtml.characters("[");
+                xhtml.characters(id);
+                xhtml.characters("]");
+            } catch (SAXException e) {
+                //swallow
+            }
+        }
+    }
+
+    @Override
+    public boolean getIncludeMoveFromText() {
+        return includeMoveFromText;
+    }
+
+    @Override
+    public void embeddedOLERef(String relId) {
+        if (relId == null) {
+            return;
+        }
+        try {
+            AttributesImpl attributes = new AttributesImpl();
+            attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+            attributes.addAttribute("", "id", "id", "CDATA", relId);
+            xhtml.startElement("div", attributes);
+            xhtml.endElement("div");
+
+        } catch (SAXException e) {
+            //swallow
+        }
+    }
+
+    @Override
+    public void embeddedPicRef(String picFileName, String picDescription) {
+        //always emits an empty <img>; src and alt are only added when the matching argument is non-null
+        try {
+            AttributesImpl attr = new AttributesImpl();
+            if (picFileName != null) {
+                attr.addAttribute("", "src", "src", "CDATA", "embedded:" + picFileName);
+            }
+            if (picDescription != null) {
+                attr.addAttribute("", "alt", "alt", "CDATA", picDescription);
+            }
+
+            xhtml.startElement("img", attr);
+            xhtml.endElement("img");
+
+        } catch (SAXException e) {
+            //swallow
+        }
+    }
+
+    @Override
+    public void startBookmark(String id, String name) {
+        //skip bookmarks within hyperlinks
+        if (name != null && ! wroteHyperlinkStart) {
+            try {
+                xhtml.startElement("a", "name", name);
+                xhtml.endElement("a");
+            } catch (SAXException e) {
+                //swallow
+            }
+        }
+    }
+
+    @Override
+    public void endBookmark(String id) {
+        //no-op
+    }
+
+    private void closeStyleTags() throws SAXException { //closes <i> before <b>, the reverse of the order run() opens them
+        if (isItalics) {
+            xhtml.endElement("i");
+            isItalics = false;
+        }
+        if (isBold) {
+            xhtml.endElement("b");
+            isBold = false;
+        }
+    }
+
+    private void writeParagraphNumber(int numId, int ilvl,
+                                      XWPFListManager listManager,
+                                      XHTMLContentHandler xhtml) throws SAXException {
+        //a negative ilvl or numId, or a null list manager, means nothing is written
+        if (ilvl < 0 || numId < 0 || listManager == null) {
+            return;
+        }
+        String number = listManager.getFormattedNumber(BigInteger.valueOf(numId), ilvl);
+        if (number != null) {
+            xhtml.characters(number);
+        }
+
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/376318fc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
new file mode 100644
index 0000000..8cd84d9
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
@@ -0,0 +1,497 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml;
+
+
+import java.util.Date;
+import java.util.Map;
+
+import org.apache.tika.utils.DateUtils;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * This class is intended to handle anything that might contain IBodyElements:
+ * main document, headers, footers, notes, slides, etc.
+ *
+ * <p/>
+ *
+ * This class matches most elements by local name only, without checking their
+ * namespace, so it can be applied to PPTX and DOCX for text extraction.
+ *
+ * <p/>
+ * This does not work with .xlsx or .vsdx.
+ *
+ * TODO: move this into POI?
+ *
+ */
+
+public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
+
+
+    public enum EditType {
+        NONE, //initial value of the editType field, before any edit element has been seen
+        INSERT, //reported for an "ins" element
+        DELETE, //reported for a "del" element
+        MOVE_TO, //reported for a "moveTo" element
+        MOVE_FROM //reported for a "moveFrom" element
+    }
+
+    private final static String R = "r";
+    private final static String FLD = "fld";
+    private final static String RPR = "rPr";
+    private final static String P = "p";
+    private final static String P_STYLE = "pStyle";
+    private final static String PPR = "pPr";
+    private final static String T = "t";
+    private final static String TAB = "tab";
+    private final static String B = "b";
+    private final static String ILVL = "ilvl";
+    private final static String NUM_ID = "numId";
+    private final static String TC = "tc";
+    private final static String TR = "tr";
+    private final static String I = "i";
+    private final static String NUM_PR = "numPr";
+    private final static String BR = "br";
+    private final static String HYPERLINK = "hyperlink";
+    private final static String HLINK_CLICK = "hlinkClick"; //pptx hlink
+    private final static String TBL = "tbl";
+    private final static String PIC = "pic";
+    private final static String PICT = "pict";
+    private final static String IMAGEDATA = "imagedata";
+    private final static String BLIP = "blip";
+    private final static String CHOICE = "Choice";
+    private final static String FALLBACK = "Fallback";
+    private final static String OLE_OBJECT = "OLEObject";
+    private final static String CR = "cr";
+
+    public final static String W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
+    private final static String MC_NS = "http://schemas.openxmlformats.org/markup-compatibility/2006";
+    private final static String O_NS = "urn:schemas-microsoft-com:office:office";
+    private final static String PIC_NS = "http://schemas.openxmlformats.org/drawingml/2006/picture";
+    private final static String DRAWING_MAIN_NS = "http://schemas.openxmlformats.org/drawingml/2006/main";
+    private final static String V_NS = "urn:schemas-microsoft-com:vml";
+
+    private final static String OFFICE_DOC_RELATIONSHIP_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
+
+    private final static char[] TAB_CHAR = new char[]{'\t'};
+    private final static char NEWLINE = '\n';
+    
+    private final static String BOOKMARK_START = "bookmarkStart";
+    private final static String BOOKMARK_END = "bookmarkEnd";
+    private final static String FOOTNOTE_REFERENCE = "footnoteReference";
+    private final static String INS = "ins";
+    private final static String DEL = "del";
+    private final static String DEL_TEXT = "delText";
+    private final static String MOVE_FROM = "moveFrom";
+    private final static String MOVE_TO = "moveTo";
+    private final static String ENDNOTE_REFERENCE = "endnoteReference";
+
+    private final XWPFBodyContentsHandler bodyContentsHandler;
+
+    private final Map<String, String> linkedRelationships;
+
+    private boolean inR = false;//in run or in field
+    private boolean inT = false;
+    private boolean inRPr = false;
+    private boolean inNumPr = false;
+
+    private boolean inPic = false;
+    private boolean inPict = false;
+    private String picDescription = null;
+    private String picRId = null;
+    private String picFilename = null;
+
+    //mechanism used to determine when to
+    //signal the start of the p, and still
+    //handle p with pPr and those without
+    private boolean lastStartElementWasP = false;
+    //have we signaled the start of a p?
+    //pPr can happen multiple times within a p
+    //<p><pPr/><r><t>text</t></r><pPr/></p>
+    private boolean pStarted = false;
+
+    //alternate content can be embedded in itself.
+    //need to track depth.
+    //if in alternate, choose fallback, maybe make this configurable?
+    private int inACChoiceDepth = 0;
+    private int inACFallbackDepth = 0;
+
+    private final RunProperties currRunProperties = new RunProperties();
+    private final ParagraphProperties currPProperties = new ParagraphProperties();
+
+    private final StringBuilder runBuffer = new StringBuilder();
+
+
+    private boolean inDelText = false;
+    private boolean inHlinkClick = false;
+
+    private OOXMLWordAndPowerPointTextHandler.EditType editType = OOXMLWordAndPowerPointTextHandler.EditType.NONE;
+
+
+    public OOXMLWordAndPowerPointTextHandler(XWPFBodyContentsHandler bodyContentsHandler,
+                                             Map<String, String> hyperlinks) {
+        this.bodyContentsHandler = bodyContentsHandler; //receives every paragraph/run/table/etc. callback
+        this.linkedRelationships = hyperlinks; //looked up by relationship id for hyperlink and hlinkClick elements
+    }
+
+
+    @Override
+    public void startDocument() throws SAXException { //no-op
+    }
+
+    @Override
+    public void endDocument() throws SAXException { //no-op
+    }
+
+    @Override
+    public void startPrefixMapping(String prefix, String uri) throws SAXException { //no-op
+    }
+
+    @Override
+    public void endPrefixMapping(String prefix) throws SAXException { //no-op
+    }
+
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+        //TODO: checkBox, textBox, sym, headerReference, footerReference, commentRangeEnd
+        //a <p> start is only reported once the next element is known, so that a leading pPr can be read first
+        if (lastStartElementWasP && ! PPR.equals(localName)) {
+            bodyContentsHandler.startParagraph(currPProperties);
+        }
+
+        lastStartElementWasP = false;
+
+        if (uri != null && uri.equals(MC_NS)) {
+            if (CHOICE.equals(localName)) {
+                inACChoiceDepth++;
+            } else if (FALLBACK.equals(localName)) {
+                inACFallbackDepth++;
+            }
+        }
+
+        if (inACChoiceDepth > 0) {
+            return;
+        }
+        //these are sorted descending by frequency within docx files
+        //in our regression corpus.
+        //yes, I know, likely premature optimization...
+        if (RPR.equals(localName)) {
+            inRPr = true;
+        } else if (R.equals(localName)) {
+            inR = true;
+        } else if (T.equals(localName)) {
+            inT = true;
+        } else if (TAB.equals(localName)) {
+            runBuffer.append(TAB_CHAR);
+        } else if (P.equals(localName)) {
+            lastStartElementWasP = true;
+        } else if (B.equals(localName)) { //TODO: add bCs
+            if(inR && inRPr) {
+                currRunProperties.setBold(true);
+            }
+        } else if (TC.equals(localName)) {
+            bodyContentsHandler.startTableCell();
+        } else if (P_STYLE.equals(localName)) {
+            String styleId = atts.getValue(W_NS, "val");
+            currPProperties.setStyleID(styleId);
+        } else if (I.equals(localName)) { //TODO: add iCs
+            //rprs don't have to be inR; ignore those that aren't
+            if (inR && inRPr) {
+                currRunProperties.setItalics(true);
+            }
+        } else if (TR.equals(localName)) {
+            bodyContentsHandler.startTableRow();
+        } else if (NUM_PR.equals(localName)) {
+            inNumPr = true;
+        } else if (ILVL.equals(localName)) {
+            if (inNumPr) {
+                currPProperties.setIlvl(getIntVal(atts));
+            }
+        } else if (NUM_ID.equals(localName)) {
+            if (inNumPr) {
+                currPProperties.setNumId(getIntVal(atts));
+            }
+        } else if(BR.equals(localName)) {
+            runBuffer.append(NEWLINE);
+        } else if (BOOKMARK_START.equals(localName)) {
+            String name = atts.getValue(W_NS, "name");
+            String id = atts.getValue(W_NS, "id");
+            bodyContentsHandler.startBookmark(id, name);
+        } else if (BOOKMARK_END.equals(localName)) {
+            String id = atts.getValue(W_NS, "id");
+            bodyContentsHandler.endBookmark(id);
+        } else if (HYPERLINK.equals(localName)) { //docx hyperlink
+            String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
+            String hyperlink = null;
+            if (hyperlinkId != null) {
+                hyperlink = linkedRelationships.get(hyperlinkId);
+                bodyContentsHandler.hyperlinkStart(hyperlink);
+            } else {
+                String anchor = atts.getValue(W_NS, "anchor");
+                if (anchor != null) {
+                    anchor = "#" + anchor;
+                }
+                bodyContentsHandler.hyperlinkStart(anchor);
+            }
+        } else if (HLINK_CLICK.equals(localName)) { //pptx hyperlink
+            String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
+            String hyperlink = null;
+            if (hyperlinkId != null) {
+                hyperlink = linkedRelationships.get(hyperlinkId);
+                bodyContentsHandler.hyperlinkStart(hyperlink);
+                inHlinkClick = true;
+            }
+        } else if(TBL.equals(localName)) {
+            bodyContentsHandler.startTable();
+        } else if (BLIP.equals(localName)) { //check for DRAWING_NS
+            picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "embed");
+        } else if ("cNvPr".equals(localName)) { //check for PIC_NS?
+            picDescription = atts.getValue("", "descr");
+        } else if (PIC.equals(localName)) {
+            inPic = true; //check for PIC_NS?
+        } //TODO: add sdt, sdtPr, sdtContent goes here statistically
+        else if (FOOTNOTE_REFERENCE.equals(localName)) {
+            String id = atts.getValue(W_NS, "id");
+            bodyContentsHandler.footnoteReference(id);
+        } else if (IMAGEDATA.equals(localName)) {
+            picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
+            picDescription = atts.getValue(O_NS, "title");
+        } else if (INS.equals(localName)) {
+            startEditedSection(EditType.INSERT, atts); //enum constant, not a read of the editType field
+        } else if (DEL_TEXT.equals(localName)) {
+            inDelText = true;
+        } else if (DEL.equals(localName)) {
+            startEditedSection(EditType.DELETE, atts);
+        } else if (MOVE_TO.equals(localName)) {
+            startEditedSection(EditType.MOVE_TO, atts);
+        } else if (MOVE_FROM.equals(localName)) {
+            startEditedSection(EditType.MOVE_FROM, atts);
+        } else if (OLE_OBJECT.equals(localName)){ //check for O_NS?
+            String type = null;
+            String refId = null;
+            //TODO: clean this up and ...want to get ProgID?
+            for (int i = 0; i < atts.getLength(); i++) {
+                String attLocalName = atts.getLocalName(i);
+                String attValue = atts.getValue(i);
+                if ("Type".equals(attLocalName)) {
+                    type = attValue;
+                } else if (OFFICE_DOC_RELATIONSHIP_NS.equals(atts.getURI(i)) && "id".equals(attLocalName)) {
+                    refId = attValue;
+                }
+            }
+            if ("Embed".equals(type)) {
+                bodyContentsHandler.embeddedOLERef(refId);
+            }
+        } else if(CR.equals(localName)) {
+            runBuffer.append(NEWLINE);
+        } else if (ENDNOTE_REFERENCE.equals(localName)) {
+            String id = atts.getValue(W_NS, "id");
+            bodyContentsHandler.endnoteReference(id);
+        }
+
+    }
+
+    /**
+     * Announces the start of an edited section to the body contents handler
+     * and remembers its type as the edit currently in effect.
+     *
+     * @param editType kind of edit that is starting
+     * @param atts attributes of the opening element; the "author" and "date"
+     *             attributes (in W_NS) are read and either may be absent
+     */
+    private void startEditedSection(EditType editType, Attributes atts) {
+        final String who = atts.getValue(W_NS, "author");
+        final String rawWhen = atts.getValue(W_NS, "date");
+        //only hand a value to the date parser when the attribute was present
+        final Date when = (rawWhen == null) ? null : DateUtils.tryToParse(rawWhen);
+        bodyContentsHandler.startEditedSection(who, when, editType);
+        this.editType = editType;
+    }
+
+    /**
+     * Reads the "val" attribute (in W_NS) as an int.
+     *
+     * @param atts attributes of the current element
+     * @return the parsed value, or -1 when the attribute is absent or does
+     *         not hold a valid int
+     */
+    private int getIntVal(Attributes atts) {
+        final String raw = atts.getValue(W_NS, "val");
+        if (raw == null) {
+            return -1;
+        }
+        try {
+            return Integer.parseInt(raw);
+        } catch (NumberFormatException ignored) {
+            //malformed number: report it the same way as a missing attribute
+            return -1;
+        }
+    }
+
+
+    /**
+     * Translates the close of an element into the matching body contents
+     * handler callback and clears the parse state that the element opened.
+     * <p>
+     * Elements are matched on their local name only; {@code uri} and
+     * {@code qName} are not consulted. After the Choice/Fallback depth
+     * counters have been updated, nothing else is processed while the
+     * Choice depth is still positive.
+     *
+     * @param uri namespace URI of the element (not used)
+     * @param localName local name of the element being closed
+     * @param qName qualified name of the element (not used)
+     * @throws SAXException declared by the SAX contract
+     */
+    @Override
+    public void endElement(String uri, String localName, String qName) throws SAXException {
+
+        if (CHOICE.equals(localName)) {
+            inACChoiceDepth--;
+        } else if (FALLBACK.equals(localName)) {
+            inACFallbackDepth--;
+        }
+        if (inACChoiceDepth > 0) {
+            return;
+        }
+
+        if (PIC.equals(localName)) { //PIC_NS
+            handlePict();
+            inPic = false;
+            return;
+        } else if (RPR.equals(localName)) {
+            inRPr = false;
+        } else if (R.equals(localName)) {
+            handleEndOfRun();
+        } else if (T.equals(localName)) {
+            inT = false;
+        } else if (PPR.equals(localName)) {
+            //the paragraph properties are complete: open the paragraph now
+            //if it has not been opened yet, then clear them for the next one
+            if (!pStarted) {
+                bodyContentsHandler.startParagraph(currPProperties);
+                pStarted = true;
+            }
+            currPProperties.reset();
+        } else if (P.equals(localName)) {
+            if (runBuffer.length() > 0) {
+                //<p><tab></p>...this will treat that as if it were
+                //a run...TODO: should we swallow whitespace that doesn't occur in a run?
+                bodyContentsHandler.run(currRunProperties, runBuffer.toString());
+                runBuffer.setLength(0);
+            }
+            pStarted = false;
+            bodyContentsHandler.endParagraph();
+        } else if (TC.equals(localName)) {
+            bodyContentsHandler.endTableCell();
+        } else if (TR.equals(localName)) {
+            bodyContentsHandler.endTableRow();
+        } else if (TBL.equals(localName)) {
+            bodyContentsHandler.endTable();
+        } else if (FLD.equals(localName)) {
+            //a field is flushed exactly like a run
+            handleEndOfRun();
+        } else if (DEL_TEXT.equals(localName)) {
+            inDelText = false;
+        } else if (INS.equals(localName) || DEL.equals(localName) ||
+                MOVE_TO.equals(localName) || MOVE_FROM.equals(localName)) {
+            //pair the startEditedSection() notification that was sent when
+            //this element opened, then leave the edited state
+            bodyContentsHandler.endEditedSection();
+            editType = EditType.NONE;
+        } else if (HYPERLINK.equals(localName)) {
+            bodyContentsHandler.hyperlinkEnd();
+        } else if (PICT.equals(localName)) {
+            handlePict();
+        }
+    }
+
+    /**
+     * Flushes the buffered run text to the body contents handler, closes the
+     * hyperlink tracked by inHlinkClick (if one is open), and resets the
+     * per-run state.
+     */
+    private void handleEndOfRun() {
+        final String runText = runBuffer.toString();
+        bodyContentsHandler.run(currRunProperties, runText);
+        if (inHlinkClick) {
+            bodyContentsHandler.hyperlinkEnd();
+            inHlinkClick = false;
+        }
+        //the run is over: drop its text and formatting so that neither
+        //carries over into the next run
+        runBuffer.setLength(0);
+        currRunProperties.setItalics(false);
+        currRunProperties.setBold(false);
+        inR = false;
+    }
+
+    /**
+     * Reports the picture collected so far to the body contents handler and
+     * clears the picture state. The file name is looked up in
+     * linkedRelationships by the stored relationship id; null is passed when
+     * no id was captured.
+     */
+    private void handlePict() {
+        final String target = (picRId == null) ? null : linkedRelationships.get(picRId);
+        bodyContentsHandler.embeddedPicRef(target, picDescription);
+        inPic = false;
+        picRId = null;
+        picDescription = null;
+    }
+
+    /**
+     * Buffers character data for the current run.
+     * <p>
+     * Inside a text element the characters are kept, unless the active edit
+     * is MOVE_FROM and the handler declines move-from text. Outside a text
+     * element they are kept only when the handler asks for deleted text and
+     * the active edit is DELETE. Everything is dropped while inside an
+     * AlternateContent Choice.
+     *
+     * @param ch buffer holding the characters
+     * @param start offset of the first character in {@code ch}
+     * @param length number of characters to read
+     * @throws SAXException declared by the SAX contract
+     */
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+        if (inACChoiceDepth > 0) {
+            return;
+        }
+        final boolean movedFrom = editType.equals(EditType.MOVE_FROM);
+        final boolean keep;
+        if (inT) {
+            //the handler is only asked about move-from text when it matters
+            keep = !movedFrom || bodyContentsHandler.getIncludeMoveFromText();
+        } else {
+            keep = bodyContentsHandler.getIncludeDeletedText() && editType.equals(EditType.DELETE);
+        }
+        if (keep) {
+            runBuffer.append(ch, start, length);
+        }
+    }
+
+    /**
+     * Buffers ignorable whitespace for the current run. It is kept inside a
+     * text element, or inside deleted text when the handler asks for deleted
+     * text; it is dropped while inside an AlternateContent Choice.
+     *
+     * @param ch buffer holding the whitespace
+     * @param start offset of the first character in {@code ch}
+     * @param length number of characters to read
+     * @throws SAXException declared by the SAX contract
+     */
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+        if (inACChoiceDepth > 0) {
+            return;
+        }
+        final boolean keep = inT || (bodyContentsHandler.getIncludeDeletedText() && inDelText);
+        if (keep) {
+            runBuffer.append(ch, start, length);
+        }
+    }
+
+
+    /**
+     * Callbacks through which the enclosing SAX handler reports the
+     * document structure and text it finds in a body part.
+     */
+    public interface XWPFBodyContentsHandler {
+
+        /**
+         * Receives the text gathered for a run.
+         *
+         * @param runProperties formatting in effect for the run
+         * @param contents the buffered text; can be empty
+         */
+        void run(RunProperties runProperties, String contents);
+
+        /**
+         * @param link the link; can be null
+         */
+        void hyperlinkStart(String link);
+
+        /** Signals the end of the hyperlink that was last started. */
+        void hyperlinkEnd();
+
+        /**
+         * Signals the start of a paragraph.
+         *
+         * @param paragraphProperties properties collected for the paragraph
+         */
+        void startParagraph(ParagraphProperties paragraphProperties);
+
+        /** Signals the end of the current paragraph. */
+        void endParagraph();
+
+        /** Signals the start of a table. */
+        void startTable();
+
+        /** Signals the end of a table. */
+        void endTable();
+
+        /** Signals the start of a table row. */
+        void startTableRow();
+
+        /** Signals the end of a table row. */
+        void endTableRow();
+
+        /** Signals the start of a table cell. */
+        void startTableCell();
+
+        /** Signals the end of a table cell. */
+        void endTableCell();
+
+        /** Signals the start of an SDT section. */
+        void startSDT();
+
+        /** Signals the end of an SDT section. */
+        void endSDT();
+
+        /**
+         * Signals the start of an edited (tracked change) section.
+         *
+         * @param editor value of the author attribute; can be null
+         * @param date parsed value of the date attribute; null when the
+         *             attribute is absent
+         * @param editType kind of edit that is starting
+         */
+        void startEditedSection(String editor, Date date, EditType editType);
+
+        /** Signals the end of an edited section. */
+        void endEditedSection();
+
+        /**
+         * @return true if text belonging to deletions should be reported
+         */
+        boolean getIncludeDeletedText();
+
+        /**
+         * @param id value of the footnote reference's id attribute; can be null
+         */
+        void footnoteReference(String id);
+
+        /**
+         * @param id value of the endnote reference's id attribute; can be null
+         */
+        void endnoteReference(String id);
+
+        /**
+         * @return true if text inside a move-from edit should be reported
+         */
+        boolean getIncludeMoveFromText();
+
+        /**
+         * Reports an OLE object whose Type attribute is "Embed".
+         *
+         * @param refId relationship id found on the element; can be null
+         */
+        void embeddedOLERef(String refId);
+
+        /**
+         * Reports an embedded picture.
+         *
+         * @param picFileName target looked up for the picture's relationship
+         *                    id; can be null
+         * @param picDescription description read for the picture; can be null
+         */
+        void embeddedPicRef(String picFileName, String picDescription);
+
+        /**
+         * Signals the start of a bookmark.
+         *
+         * @param id id of the bookmark
+         * @param name name of the bookmark
+         */
+        void startBookmark(String id, String name);
+
+        /**
+         * Signals the end of a bookmark.
+         *
+         * @param id id of the bookmark
+         */
+        void endBookmark(String id);
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/376318fc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
index 21577c4..a7de780 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
@@ -36,9 +36,7 @@ import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
 import org.apache.poi.xslf.usermodel.XSLFRelation;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFDocumentXMLBodyHandler;
 import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor;
-import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFTikaBodyPartHandler;
 import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.OfflineContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
@@ -110,14 +108,14 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
         handleBasicRelatedParts(XSLFRelation.SLIDE_MASTER.getRelation(),
                 "slide-master",
                 mainDocument,
-                new PlaceHolderSkipper(new XSLFDocumentXMLBodyHandler(
-                        new XSLFTikaBodyPartHandler(xhtml), new HashMap<String, String>())));
+                new PlaceHolderSkipper(new OOXMLWordAndPowerPointTextHandler(
+                        new OOXMLTikaBodyPartHandler(xhtml), new HashMap<String, String>())));
 
         handleBasicRelatedParts(HANDOUT_MASTER,
                 "slide-handout-master",
                 mainDocument,
-                new XSLFDocumentXMLBodyHandler(
-                        new XSLFTikaBodyPartHandler(xhtml), new HashMap<String, String>())
+                new OOXMLWordAndPowerPointTextHandler(
+                        new OOXMLTikaBodyPartHandler(xhtml), new HashMap<String, String>())
         );
     }
 
@@ -162,8 +160,8 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
             context.getSAXParser().parse(
                     new CloseShieldInputStream(stream),
                     new OfflineContentHandler(new EmbeddedContentHandler(
-                            new XSLFDocumentXMLBodyHandler(
-                                    new XSLFTikaBodyPartHandler(xhtml), linkedRelationships))));
+                            new OOXMLWordAndPowerPointTextHandler(
+                                    new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships))));
 
         } catch (TikaException e) {
             //do something with this
@@ -174,19 +172,19 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
 
         handleBasicRelatedParts(XSLFRelation.SLIDE_LAYOUT.getRelation(),
                 "slide-master-content", slidePart,
-                new PlaceHolderSkipper(new XSLFDocumentXMLBodyHandler(
-                        new XSLFTikaBodyPartHandler(xhtml), linkedRelationships))
+                new PlaceHolderSkipper(new OOXMLWordAndPowerPointTextHandler(
+                        new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships))
                 );
 
         handleBasicRelatedParts(XSLFRelation.NOTES.getRelation(),
                 "slide-notes", slidePart,
-                new XSLFDocumentXMLBodyHandler(
-                        new XSLFTikaBodyPartHandler(xhtml), linkedRelationships));
+                new OOXMLWordAndPowerPointTextHandler(
+                        new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships));
 
         handleBasicRelatedParts(XSLFRelation.NOTES_MASTER.getRelation(),
                 "slide-notes-master", slidePart,
-                new XSLFDocumentXMLBodyHandler(
-                        new XSLFTikaBodyPartHandler(xhtml), linkedRelationships));
+                new OOXMLWordAndPowerPointTextHandler(
+                        new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships));
 
         handleBasicRelatedParts(XSLFRelation.COMMENTS.getRelation(),
                 null, slidePart,
@@ -387,9 +385,9 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
 
     private static class PlaceHolderSkipper extends DefaultHandler {
 
-        private final XSLFDocumentXMLBodyHandler wrappedHandler;
+        private final ContentHandler wrappedHandler;
 
-        PlaceHolderSkipper(XSLFDocumentXMLBodyHandler wrappedHandler) {
+        PlaceHolderSkipper(ContentHandler wrappedHandler) {
             this.wrappedHandler = wrappedHandler;
         }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/376318fc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index d60b274..8f9fbf5 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -34,11 +34,9 @@ import org.apache.poi.xwpf.usermodel.XWPFRelation;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.microsoft.OfficeParserConfig;
-import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFDocumentXMLBodyHandler;
 import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
 import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFNumberingShim;
 import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFStylesShim;
-import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFTikaBodyPartHandler;
 import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.OfflineContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
@@ -166,8 +164,8 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
             context.getSAXParser().parse(
                     new CloseShieldInputStream(stream),
                     new OfflineContentHandler(new EmbeddedContentHandler(
-                            new XWPFDocumentXMLBodyHandler(
-                                    new XWPFTikaBodyPartHandler(xhtml, styles, listManager,
+                            new OOXMLWordAndPowerPointTextHandler(
+                                    new OOXMLTikaBodyPartHandler(xhtml, styles, listManager,
                                             context.get(OfficeParserConfig.class)), linkedRelationships))));
         } catch (TikaException e) {
             //swallow

http://git-wip-us.apache.org/repos/asf/tika/blob/376318fc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
index 2a99126..c8bcdc7 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
@@ -31,6 +31,12 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNumLvl;
 
 
 public class XWPFListManager extends AbstractListManager {
+
+    /**
+     * Empty singleton to be used when there is no list manager.
+     * Always returns empty string.
+     */
+    public final static XWPFListManager EMPTY_LIST = new EmptyListManager();
     private final static boolean OVERRIDE_AVAILABLE;
     private final static String SKIP_FORMAT = Character.toString((char) 61623);//if this shows up as the lvlText, don't show a number
 
@@ -175,4 +181,21 @@ public class XWPFListManager extends AbstractListManager {
         return new LevelTuple(start, restart, lvlText, numFmt, isLegal);
     }
 
+
+    /** List manager stand-in that yields an empty label for every request. */
+    private static class EmptyListManager extends XWPFListManager {
+        private static final String NO_LABEL = "";
+        EmptyListManager() {
+            super(null);
+        }
+        //both overloads ignore their arguments
+        @Override
+        public String getFormattedNumber(XWPFParagraph paragraph) {
+            return NO_LABEL;
+        }
+        @Override
+        public String getFormattedNumber(BigInteger numId, int iLvl) {
+            return NO_LABEL;
+        }
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/376318fc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFDocumentXMLBodyHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFDocumentXMLBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFDocumentXMLBodyHandler.java
deleted file mode 100644
index b5aa449..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFDocumentXMLBodyHandler.java
+++ /dev/null
@@ -1,330 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xslf;
-
-
-import java.util.Map;
-
-import org.apache.tika.parser.microsoft.ooxml.AbstractDocumentXMLBodyHandler;
-import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
-import org.apache.tika.parser.microsoft.ooxml.RunProperties;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-
-/**
- * This class is intended to handle anything that might contain IBodyElements:
- * main document, headers, footers, notes, etc.
- */
-
-public class XSLFDocumentXMLBodyHandler extends AbstractDocumentXMLBodyHandler {
-
-
-    private final XSLFBodyContentsHandler bodyContentsHandler;
-    //private final RelationshipsManager relationshipsManager;
-
-
-    //alternate content can be embedded in itself.
-    //need to track depth.
-    //if in alternate, choose fallback, maybe make this configurable?
-    private int inACChoiceDepth = 0;
-    private int inACFallbackDepth = 0;
-
-    private boolean inHyperlink = false;
-
-    private final Map<String, String> linkedRelationships;
-
-    public XSLFDocumentXMLBodyHandler(XSLFBodyContentsHandler bodyContentsHandler,
-                                      Map<String, String> linkedRelationships) {
-        this.bodyContentsHandler = bodyContentsHandler;
-        this.linkedRelationships = linkedRelationships;
-    }
-
-
-    @Override
-    public void startDocument() throws SAXException {
-    }
-
-    @Override
-    public void endDocument() throws SAXException {
-    }
-
-    @Override
-    public void startPrefixMapping(String prefix, String uri) throws SAXException {
-    }
-
-    @Override
-    public void endPrefixMapping(String prefix) throws SAXException {
-    }
-
-    @Override
-    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
-        //TODO: checkBox, textBox, sym, headerReference, footerReference, commentRangeEnd
-
-        if (lastStartElementWasP && ! PPR.equals(localName)) {
-            bodyContentsHandler.startParagraph(currPProperties);
-            pStarted = true;
-        }
-
-        lastStartElementWasP = false;
-
-        if (uri != null && uri.equals(MC_NS)) {
-            if (CHOICE.equals(localName)) {
-                inACChoiceDepth++;
-            } else if (FALLBACK.equals(localName)) {
-                inACFallbackDepth++;
-            }
-        }
-
-        if (inACChoiceDepth > 0) {
-            return;
-        }
-        //these are sorted descending by frequency
-        //in our regression corpus
-        if (RPR.equals(localName)) {
-            inRPr = true;
-        } else if (R.equals(localName)) {
-            inR = true;
-        } else if (T.equals(localName)) {
-            inT = true;
-        } else if (TAB.equals(localName)) {
-            runBuffer.append(TAB_CHAR);
-        } else if (P.equals(localName)) {
-            lastStartElementWasP = true;
-        } else if (B.equals(localName)) { //TODO: add bCs
-            if(inR && inRPr) {
-                currRunProperties.setBold(true);
-            }
-        } else if (TC.equals(localName)) {
-            bodyContentsHandler.startTableCell();
-        } else if (P_STYLE.equals(localName)) {
-            String styleId = atts.getValue(W_NS, "val");
-            currPProperties.setStyleID(styleId);
-        } else if (I.equals(localName)) { //TODO: add iCs
-            //rprs don't have to be inR; ignore those that aren't
-            if (inR && inRPr) {
-                currRunProperties.setItalics(true);
-            }
-        } else if (FLD.equals(localName)) {
-            inR = true;
-        } else if (TR.equals(localName)) {
-            bodyContentsHandler.startTableRow();
-        } else if (NUM_PR.equals(localName)) {
-            inNumPr = true;
-        } else if (ILVL.equals(localName)) {
-            if (inNumPr) {
-                currPProperties.setIlvl(getIntVal(atts));
-            }
-        } else if (NUM_ID.equals(localName)) {
-            if (inNumPr) {
-                currPProperties.setNumId(getIntVal(atts));
-            }
-        } else if(BR.equals(localName)) {
-            runBuffer.append(NEWLINE);
-        } else if ("hlinkClick".equals(localName)) {
-            String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
-            String hyperlink = null;
-            if (hyperlinkId != null) {
-                hyperlink = linkedRelationships.get(hyperlinkId);
-                bodyContentsHandler.hyperlinkStart(hyperlink);
-                inHyperlink = true;
-            }/* else {
-                String anchor = atts.getValue(W_NS, "anchor");
-                if (anchor != null) {
-                    anchor = "#" + anchor;
-                }
-                bodyContentsHandler.hyperlinkStart(anchor);
-                inHyperlink = true;
-            }*/
-        } else if(TBL.equals(localName)) {
-            bodyContentsHandler.startTable();
-        } else if (BLIP.equals(localName)) { //check for DRAWING_NS
-            picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "embed");
-        } else if ("cNvPr".equals(localName)) { //check for PIC_NS?
-            picDescription = atts.getValue("", "descr");
-        } else if (PIC.equals(localName)) {
-            inPic = true; //check for PIC_NS?
-        } else if (IMAGEDATA.equals(localName)) {
-            picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
-            picDescription = atts.getValue(O_NS, "title");
-        } else if (OLE_OBJECT.equals(localName)){ //check for O_NS?
-            String type = null;
-            String refId = null;
-            //TODO: clean this up and ...want to get ProgID?
-            for (int i = 0; i < atts.getLength(); i++) {
-                String attLocalName = atts.getLocalName(i);
-                String attValue = atts.getValue(i);
-                if (attLocalName.equals("Type")) {
-                    type = attValue;
-                } else if (OFFICE_DOC_RELATIONSHIP_NS.equals(atts.getURI(i)) && attLocalName.equals("id")) {
-                    refId = attValue;
-                }
-            }
-            if ("Embed".equals(type)) {
-                bodyContentsHandler.embeddedOLERef(refId);
-            }
-        } else if(CR.equals(localName)) {
-            runBuffer.append(NEWLINE);
-        }
-
-    }
-
-
-    private int getIntVal(Attributes atts) {
-        String valString = atts.getValue(W_NS, "val");
-        if (valString != null) {
-            try {
-                return Integer.parseInt(valString);
-            } catch (NumberFormatException e) {
-                //swallow
-            }
-        }
-        return -1;
-    }
-
-
-    @Override
-    public void endElement(String uri, String localName, String qName) throws SAXException {
-
-        if (CHOICE.equals(localName)) {
-            inACChoiceDepth--;
-        } else if (FALLBACK.equals(localName)) {
-            inACFallbackDepth--;
-        }
-        if (inACChoiceDepth > 0) {
-            return;
-        }
-
-        if (PIC.equals(localName)) { //PIC_NS
-            handlePict();
-            inPic = false;
-            return;
-        } else if (RPR.equals(localName)) {
-            inRPr = false;
-        } else if (R.equals(localName)) {
-            handleEndOfRun();
-        } else if (T.equals(localName)) {
-            inT = false;
-        } else if (PPR.equals(localName)) {
-            if (!pStarted) {
-                bodyContentsHandler.startParagraph(currPProperties);
-                pStarted = true;
-            }
-            currPProperties.reset();
-        } else if (P.equals(localName)) {
-            if (runBuffer.length() > 0) {
-                //<p><tab></p>...this will treat that as if it were
-                //a run...TODO: should we swallow whitespace that doesn't occur in a run?
-                bodyContentsHandler.run(currRunProperties, runBuffer.toString());
-                runBuffer.setLength(0);
-            }
-            pStarted = false;
-            bodyContentsHandler.endParagraph();
-        } else if (TC.equals(localName)) {
-            bodyContentsHandler.endTableCell();
-        } else if (TR.equals(localName)) {
-            bodyContentsHandler.endTableRow();
-        } else if (TBL.equals(localName)) {
-            bodyContentsHandler.endTable();
-        } else if (FLD.equals(localName)) {
-            handleEndOfRun();
-        } else if (HYPERLINK.equals(localName)) {
-            bodyContentsHandler.hyperlinkEnd();
-        } else if (PICT.equals(localName)) {
-            handlePict();
-        }
-    }
-
-    private void handleEndOfRun() {
-        bodyContentsHandler.run(currRunProperties, runBuffer.toString());
-        if (inHyperlink) {
-            bodyContentsHandler.hyperlinkEnd();
-            inHyperlink = false;
-        }
-        inR = false;
-        runBuffer.setLength(0);
-        currRunProperties.setBold(false);
-        currRunProperties.setItalics(false);
-    }
-
-    private void handlePict() {
-        String picFileName = null;
-        if (picRId != null) {
-            picFileName = "picId";//TODO: linkedRelationships.get(picRId);
-        }
-        bodyContentsHandler.embeddedPicRef(picFileName, picDescription);
-        picDescription = null;
-        picRId = null;
-        inPic = false;
-    }
-
-    @Override
-    public void characters(char[] ch, int start, int length) throws SAXException {
-
-        if (inACChoiceDepth > 0) {
-            return;
-        }
-         if (inT) {
-            runBuffer.append(ch, start, length);
-        }
-    }
-
-    @Override
-    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
-        if (inACChoiceDepth > 0) {
-            return;
-        }
-
-        if (inT) {
-            runBuffer.append(ch, start, length);
-        }
-    }
-
-
-    public interface XSLFBodyContentsHandler {
-
-        void run(RunProperties runProperties, String contents);
-
-        /**
-         * @param link the link; can be null
-         */
-        void hyperlinkStart(String link);
-
-        void hyperlinkEnd();
-
-        void startParagraph(ParagraphProperties paragraphProperties);
-
-        void endParagraph();
-
-        void startTable();
-
-        void endTable();
-
-        void startTableRow();
-
-        void endTableRow();
-
-        void startTableCell();
-
-        void endTableCell();
-
-        void embeddedOLERef(String refId);
-
-        void embeddedPicRef(String picFileName, String picDescription);
-
-    }
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/376318fc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
index 15bbd6a..3e98203 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
@@ -18,12 +18,14 @@
 package org.apache.tika.parser.microsoft.ooxml.xslf;
 
 import java.io.IOException;
+import java.util.Date;
 
 import org.apache.poi.POIXMLDocument;
 import org.apache.poi.POIXMLProperties;
 import org.apache.poi.POIXMLTextExtractor;
 import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
 import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler;
 import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
 import org.apache.tika.parser.microsoft.ooxml.RunProperties;
 import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
@@ -83,7 +85,7 @@ public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor {
 
 
 
-    private class XSLFToTextContentHandler implements XSLFDocumentXMLBodyHandler.XSLFBodyContentsHandler {
+    private class XSLFToTextContentHandler implements OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler {
         private final StringBuilder buffer;
 
         public XSLFToTextContentHandler(StringBuilder buffer) {
@@ -145,6 +147,45 @@ public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor {
             buffer.append("\t");
         }
 
+        @Override
+        public void startSDT() {
+            //no-op
+        }
+
+        @Override
+        public void endSDT() {
+            //no-op
+        }
+
+        @Override
+        public void startEditedSection(String editor, Date date, OOXMLWordAndPowerPointTextHandler.EditType editType) {
+            //no-op: edit information is not written to the text buffer
+        }
+
+        @Override
+        public void endEditedSection() {
+            //no-op
+        }
+
+        @Override
+        public boolean getIncludeDeletedText() {
+            return false; //this handler never asks for deleted text
+        }
+
+        @Override
+        public void footnoteReference(String id) {
+            //no-op: footnote references are not written to the text buffer
+        }
+
+        @Override
+        public void endnoteReference(String id) {
+            //no-op: endnote references are not written to the text buffer
+        }
+
+        @Override
+        public boolean getIncludeMoveFromText() {
+            return false; //this handler never asks for move-from text
+        }
 
 
         @Override
@@ -157,5 +198,15 @@ public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor {
             //no-op
         }
 
+        @Override
+        public void startBookmark(String id, String name) {
+            //no-op
+        }
+
+        @Override
+        public void endBookmark(String id) {
+            //no-op
+        }
+
     }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/376318fc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFTikaBodyPartHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFTikaBodyPartHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFTikaBodyPartHandler.java
deleted file mode 100644
index ff587f7..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFTikaBodyPartHandler.java
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xslf;
-
-
-import java.math.BigInteger;
-
-import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
-import org.apache.tika.parser.microsoft.ooxml.RunProperties;
-import org.apache.tika.parser.microsoft.ooxml.XWPFListManager;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-public class XSLFTikaBodyPartHandler implements XSLFDocumentXMLBodyHandler.XSLFBodyContentsHandler {
-
-    private final static String P = "p";
-
-    private final static char[] NEWLINE = new char[]{'\n'};
-    private final static char[] TAB = new char[]{'\t'};
-
-    private final XHTMLContentHandler xhtml;
-
-    private int pDepth = 0; //paragraph depth
-    private int tableDepth = 0;//table depth
-    private int pWithinCell = 0;//paragraph count within a cell
-    private boolean isItalics = false;
-    private boolean isBold = false;
-    private boolean wroteHyperlinkStart = false;
-    private boolean inTableCell = false;
-
-    public XSLFTikaBodyPartHandler(XHTMLContentHandler xhtml) {
-        this.xhtml = xhtml;
-    }
-
-    @Override
-    public void run(RunProperties runProperties, String contents) {
-        try {
-            // True if we are currently in the named style tag:
-            if (runProperties.getBold() != isBold) {
-                if (isItalics) {
-                    xhtml.endElement("i");
-                    isItalics = false;
-                }
-                if (runProperties.getBold()) {
-                    xhtml.startElement("b");
-                    isBold = true;
-                } else {
-                    xhtml.endElement("b");
-                    isBold = false;
-                }
-            }
-
-            if (runProperties.getItalics() != isItalics) {
-                if (runProperties.getItalics()) {
-                    xhtml.startElement("i");
-                    isItalics = true;
-                } else {
-                    xhtml.endElement("i");
-                    isItalics = false;
-                }
-            }
-
-            xhtml.characters(contents);
-
-        } catch (SAXException e) {
-
-        }
-    }
-
-    @Override
-    public void hyperlinkStart(String link) {
-        try {
-            if (link != null) {
-                xhtml.startElement("a", "href", link);
-                wroteHyperlinkStart = true;
-            }
-        } catch (SAXException e) {
-
-        }
-    }
-
-    @Override
-    public void hyperlinkEnd() {
-        try {
-            if (wroteHyperlinkStart) {
-                closeStyleTags();
-                wroteHyperlinkStart = false;
-                xhtml.endElement("a");
-            }
-        } catch (SAXException e) {
-
-        }
-    }
-
-    @Override
-    public void startParagraph(ParagraphProperties paragraphProperties) {
-        if (pDepth == 0 && tableDepth == 0) {
-            try {
-                xhtml.startElement(P);
-            } catch (SAXException e) {
-
-            }
-        }
-        pDepth++;
-    }
-
-    @Override
-    public void endParagraph() {
-        try {
-            closeStyleTags();
-            if (pDepth == 1 && tableDepth == 0) {
-                xhtml.endElement(P);
-            } else if (pWithinCell > 0){
-                xhtml.characters(NEWLINE, 0, 1);
-            }
-        } catch (SAXException e) {
-
-        }
-        if (inTableCell) {
-            pWithinCell++;
-        }
-        pDepth--;
-    }
-
-    @Override
-    public void startTable() {
-        try {
-            xhtml.startElement("table");
-            tableDepth++;
-        } catch (SAXException e) {
-
-        }
-    }
-
-    @Override
-    public void endTable() {
-        try {
-            xhtml.endElement("table");
-            tableDepth--;
-        } catch (SAXException e) {
-
-        }
-    }
-
-    @Override
-    public void startTableRow() {
-        try {
-            xhtml.startElement("tr");
-        } catch (SAXException e) {
-
-        }
-    }
-
-    @Override
-    public void endTableRow() {
-        try {
-            xhtml.endElement("tr");
-        } catch (SAXException e) {
-
-        }
-    }
-
-    @Override
-    public void startTableCell() {
-        try {
-            xhtml.startElement("td");
-        } catch (SAXException e) {
-
-        }
-        inTableCell = true;
-    }
-
-    @Override
-    public void endTableCell() {
-        try {
-            xhtml.endElement("td");
-        } catch (SAXException e) {
-
-        }
-        inTableCell = false;
-        pWithinCell = 0;
-    }
-
-
-    @Override
-    public void embeddedOLERef(String relId) {
-        if (relId == null) {
-            return;
-        }
-        try {
-            AttributesImpl attributes = new AttributesImpl();
-            attributes.addAttribute("", "class", "class", "CDATA", "embedded");
-            attributes.addAttribute("", "id", "id", "CDATA", relId);
-            xhtml.startElement("div", attributes);
-            xhtml.endElement("div");
-
-        } catch (SAXException e) {
-
-        }
-    }
-
-    @Override
-    public void embeddedPicRef(String picFileName, String picDescription) {
-
-        try {
-            AttributesImpl attr = new AttributesImpl();
-            if (picFileName != null) {
-                attr.addAttribute("", "src", "src", "CDATA", "embedded:" + picFileName);
-            }
-            if (picDescription != null) {
-                attr.addAttribute("", "alt", "alt", "CDATA", picDescription);
-            }
-
-            xhtml.startElement("img", attr);
-            xhtml.endElement("img");
-
-        } catch (SAXException e) {
-
-        }
-    }
-
-    private void closeStyleTags() throws SAXException {
-        if (isItalics) {
-            xhtml.endElement("i");
-            isItalics = false;
-        }
-        if (isBold) {
-            xhtml.endElement("b");
-            isBold = false;
-        }
-    }
-
-    private void writeParagraphNumber(int numId, int ilvl,
-                                      XWPFListManager listManager,
-                                      XHTMLContentHandler xhtml) throws SAXException {
-
-        if (ilvl < 0 || numId < 0 || listManager == null) {
-            return;
-        }
-        String number = listManager.getFormattedNumber(BigInteger.valueOf(numId), ilvl);
-        if (number != null) {
-            xhtml.characters(number);
-        }
-
-    }
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/376318fc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
deleted file mode 100644
index d08fb07..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
+++ /dev/null
@@ -1,388 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-
-import java.util.Date;
-import java.util.Map;
-
-import org.apache.tika.parser.microsoft.ooxml.AbstractDocumentXMLBodyHandler;
-import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
-import org.apache.tika.parser.microsoft.ooxml.RunProperties;
-import org.apache.tika.utils.DateUtils;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-
-/**
- * This class is intended to handle anything that might contain IBodyElements:
- * main document, headers, footers, notes, etc.
- */
-
-public class XWPFDocumentXMLBodyHandler extends AbstractDocumentXMLBodyHandler {
-
-
-    enum EditType {
-        NONE,
-        INSERT,
-        DELETE,
-        MOVE_TO,
-        MOVE_FROM
-    }
-
-
-    private final static String BOOKMARK_START = "bookmarkStart";
-    private final static String BOOKMARK_END = "bookmarkEnd";
-    private final static String FOOTNOTE_REFERENCE = "footnoteReference";
-    private final static String INS = "ins";
-    private final static String DEL = "del";
-    private final static String DEL_TEXT = "delText";
-    private final static String MOVE_FROM = "moveFrom";
-    private final static String MOVE_TO = "moveTo";
-    private final static String ENDNOTE_REFERENCE = "endnoteReference";
-
-    private final XWPFBodyContentsHandler bodyContentsHandler;
-    //private final RelationshipsManager relationshipsManager;
-    private final Map<String, String> linkedRelationships;
-
-    private boolean inDelText = false;
-
-    private XWPFDocumentXMLBodyHandler.EditType editType = XWPFDocumentXMLBodyHandler.EditType.NONE;
-
-
-    public XWPFDocumentXMLBodyHandler(XWPFBodyContentsHandler bodyContentsHandler,
-                                      Map<String, String> hyperlinks) {
-        this.bodyContentsHandler = bodyContentsHandler;
-        this.linkedRelationships = hyperlinks;
-    }
-
-
-    @Override
-    public void startDocument() throws SAXException {
-    }
-
-    @Override
-    public void endDocument() throws SAXException {
-    }
-
-    @Override
-    public void startPrefixMapping(String prefix, String uri) throws SAXException {
-    }
-
-    @Override
-    public void endPrefixMapping(String prefix) throws SAXException {
-    }
-
-    @Override
-    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
-        //TODO: checkBox, textBox, sym, headerReference, footerReference, commentRangeEnd
-
-        if (lastStartElementWasP && ! PPR.equals(localName)) {
-            bodyContentsHandler.startParagraph(currPProperties);
-        }
-
-        lastStartElementWasP = false;
-
-        if (uri != null && uri.equals(MC_NS)) {
-            if (CHOICE.equals(localName)) {
-                inACChoiceDepth++;
-            } else if (FALLBACK.equals(localName)) {
-                inACFallbackDepth++;
-            }
-        }
-
-        if (inACChoiceDepth > 0) {
-            return;
-        }
-        //these are sorted descending by frequency
-        //in our regression corpus
-        if (RPR.equals(localName)) {
-            inRPr = true;
-        } else if (R.equals(localName)) {
-            inR = true;
-        } else if (T.equals(localName)) {
-            inT = true;
-        } else if (TAB.equals(localName)) {
-            runBuffer.append(TAB_CHAR);
-        } else if (P.equals(localName)) {
-            lastStartElementWasP = true;
-        } else if (B.equals(localName)) { //TODO: add bCs
-            if(inR && inRPr) {
-                currRunProperties.setBold(true);
-            }
-        } else if (TC.equals(localName)) {
-            bodyContentsHandler.startTableCell();
-        } else if (P_STYLE.equals(localName)) {
-            String styleId = atts.getValue(W_NS, "val");
-            currPProperties.setStyleID(styleId);
-        } else if (I.equals(localName)) { //TODO: add iCs
-            //rprs don't have to be inR; ignore those that aren't
-            if (inR && inRPr) {
-                currRunProperties.setItalics(true);
-            }
-        } else if (TR.equals(localName)) {
-            bodyContentsHandler.startTableRow();
-        } else if (NUM_PR.equals(localName)) {
-            inNumPr = true;
-        } else if (ILVL.equals(localName)) {
-            if (inNumPr) {
-                currPProperties.setIlvl(getIntVal(atts));
-            }
-        } else if (NUM_ID.equals(localName)) {
-            if (inNumPr) {
-                currPProperties.setNumId(getIntVal(atts));
-            }
-        } else if(BR.equals(localName)) {
-            runBuffer.append(NEWLINE);
-        } else if (BOOKMARK_START.equals(localName)) {
-            String name = atts.getValue(W_NS, "name");
-            String id = atts.getValue(W_NS, "id");
-            bodyContentsHandler.startBookmark(id, name);
-        } else if (BOOKMARK_END.equals(localName)) {
-            String id = atts.getValue(W_NS, "id");
-            bodyContentsHandler.endBookmark(id);
-        } else if (HYPERLINK.equals(localName)) {
-            String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
-            String hyperlink = null;
-            if (hyperlinkId != null) {
-                hyperlink = linkedRelationships.get(hyperlinkId);
-                bodyContentsHandler.hyperlinkStart(hyperlink);
-            } else {
-                String anchor = atts.getValue(W_NS, "anchor");
-                if (anchor != null) {
-                    anchor = "#" + anchor;
-                }
-                bodyContentsHandler.hyperlinkStart(anchor);
-            }
-        } else if(TBL.equals(localName)) {
-            bodyContentsHandler.startTable();
-        } else if (BLIP.equals(localName)) { //check for DRAWING_NS
-            picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "embed");
-        } else if ("cNvPr".equals(localName)) { //check for PIC_NS?
-            picDescription = atts.getValue("", "descr");
-        } else if (PIC.equals(localName)) {
-            inPic = true; //check for PIC_NS?
-        } //TODO: add sdt, sdtPr, sdtContent goes here statistically
-        else if (FOOTNOTE_REFERENCE.equals(localName)) {
-            String id = atts.getValue(W_NS, "id");
-            bodyContentsHandler.footnoteReference(id);
-        } else if (IMAGEDATA.equals(localName)) {
-            picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
-            picDescription = atts.getValue(O_NS, "title");
-        } else if (INS.equals(localName)) {
-            startEditedSection(editType.INSERT, atts);
-        } else if (DEL_TEXT.equals(localName)) {
-            inDelText = true;
-        } else if (DEL.equals(localName)) {
-            startEditedSection(editType.DELETE, atts);
-        } else if (MOVE_TO.equals(localName)) {
-            startEditedSection(EditType.MOVE_TO, atts);
-        } else if (MOVE_FROM.equals(localName)) {
-            startEditedSection(editType.MOVE_FROM, atts);
-        } else if (OLE_OBJECT.equals(localName)){ //check for O_NS?
-            String type = null;
-            String refId = null;
-            //TODO: clean this up and ...want to get ProgID?
-            for (int i = 0; i < atts.getLength(); i++) {
-                String attLocalName = atts.getLocalName(i);
-                String attValue = atts.getValue(i);
-                if (attLocalName.equals("Type")) {
-                    type = attValue;
-                } else if (OFFICE_DOC_RELATIONSHIP_NS.equals(atts.getURI(i)) && attLocalName.equals("id")) {
-                    refId = attValue;
-                }
-            }
-            if ("Embed".equals(type)) {
-                bodyContentsHandler.embeddedOLERef(refId);
-            }
-        } else if(CR.equals(localName)) {
-            runBuffer.append(NEWLINE);
-        } else if (ENDNOTE_REFERENCE.equals(localName)) {
-            String id = atts.getValue(W_NS, "id");
-            bodyContentsHandler.endnoteReference(id);
-        }
-
-    }
-
-    private void startEditedSection(EditType editType, Attributes atts) {
-        String editAuthor = atts.getValue(W_NS, "author");
-        String editDateString = atts.getValue(W_NS, "date");
-        Date editDate = null;
-        if (editDateString != null) {
-            editDate = DateUtils.tryToParse(editDateString);
-        }
-        bodyContentsHandler.startEditedSection(editAuthor, editDate, editType);
-        this.editType = editType;
-    }
-
-    private int getIntVal(Attributes atts) {
-        String valString = atts.getValue(W_NS, "val");
-        if (valString != null) {
-            try {
-                return Integer.parseInt(valString);
-            } catch (NumberFormatException e) {
-                //swallow
-            }
-        }
-        return -1;
-    }
-
-
-    @Override
-    public void endElement(String uri, String localName, String qName) throws SAXException {
-
-        if (CHOICE.equals(localName)) {
-            inACChoiceDepth--;
-        } else if (FALLBACK.equals(localName)) {
-            inACFallbackDepth--;
-        }
-        if (inACChoiceDepth > 0) {
-            return;
-        }
-
-        if (PIC.equals(localName)) { //PIC_NS
-            handlePict();
-            inPic = false;
-            return;
-        } else if (RPR.equals(localName)) {
-            inRPr = false;
-        } else if (R.equals(localName)) {
-            bodyContentsHandler.run(currRunProperties, runBuffer.toString());
-            inR = false;
-            runBuffer.setLength(0);
-            currRunProperties.setBold(false);
-            currRunProperties.setItalics(false);
-        } else if (T.equals(localName)) {
-            inT = false;
-        } else if (PPR.equals(localName)) {
-            bodyContentsHandler.startParagraph(currPProperties);
-            currPProperties.reset();
-        } else if (P.equals(localName)) {
-            bodyContentsHandler.endParagraph();
-        } else if (TC.equals(localName)) {
-            bodyContentsHandler.endTableCell();
-        } else if (TR.equals(localName)) {
-            bodyContentsHandler.endTableRow();
-        } else if (TBL.equals(localName)) {
-            bodyContentsHandler.endTable();
-        } else if (HYPERLINK.equals(localName)) {
-            bodyContentsHandler.hyperlinkEnd();
-        } else if (DEL_TEXT.equals(localName)) {
-            inDelText = false;
-        } else if (INS.equals(localName) || DEL.equals(localName) ||
-                MOVE_TO.equals(localName) || MOVE_FROM.equals(localName)) {
-            editType = EditType.NONE;
-        } else if (PICT.equals(localName)) {
-            handlePict();
-
-        }
-    }
-
-    private void handlePict() {
-        String picFileName = null;
-        if (picRId != null) {
-            picFileName = linkedRelationships.get(picRId);
-        }
-        bodyContentsHandler.embeddedPicRef(picFileName, picDescription);
-        picDescription = null;
-        picRId = null;
-        inPic = false;
-    }
-
-    @Override
-    public void characters(char[] ch, int start, int length) throws SAXException {
-
-        if (inACChoiceDepth > 0) {
-            return;
-        }
-        if (editType.equals(EditType.MOVE_FROM) && inT) {
-            if (bodyContentsHandler.getIncludeMoveFromText()) {
-                runBuffer.append(ch, start, length);
-            }
-        } else if (inT) {
-            runBuffer.append(ch, start, length);
-        } else if (bodyContentsHandler.getIncludeDeletedText() && editType.equals(EditType.DELETE)) {
-            runBuffer.append(ch, start, length);
-        }
-    }
-
-    @Override
-    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
-        if (inACChoiceDepth > 0) {
-            return;
-        }
-
-        if (inT) {
-            runBuffer.append(ch, start, length);
-        } else if (bodyContentsHandler.getIncludeDeletedText() && inDelText) {
-            runBuffer.append(ch, start, length);
-        }
-    }
-
-
-    public interface XWPFBodyContentsHandler {
-
-        void run(RunProperties runProperties, String contents);
-
-        /**
-         * @param link the link; can be null
-         */
-        void hyperlinkStart(String link);
-
-        void hyperlinkEnd();
-
-        void startParagraph(ParagraphProperties paragraphProperties);
-
-        void endParagraph();
-
-        void startTable();
-
-        void endTable();
-
-        void startTableRow();
-
-        void endTableRow();
-
-        void startTableCell();
-
-        void endTableCell();
-
-        void startSDT();
-
-        void endSDT();
-
-        void startEditedSection(String editor, Date date, EditType editType);
-
-        void endEditedSection();
-
-        boolean getIncludeDeletedText();
-
-        void footnoteReference(String id);
-
-        void endnoteReference(String id);
-
-        boolean getIncludeMoveFromText();
-
-        void embeddedOLERef(String refId);
-
-        void embeddedPicRef(String picFileName, String picDescription);
-
-        void startBookmark(String id, String name);
-
-        void endBookmark(String id);
-    }
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/376318fc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
index f61fa56..7466d09 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
@@ -38,6 +38,7 @@ import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
 import org.apache.poi.util.SAXHelper;
 import org.apache.poi.xwpf.usermodel.XWPFNumbering;
 import org.apache.poi.xwpf.usermodel.XWPFRelation;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler;
 import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
 import org.apache.tika.parser.microsoft.ooxml.RunProperties;
 import org.apache.tika.parser.microsoft.ooxml.XWPFListManager;
@@ -182,7 +183,7 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
         Map<String, String> hyperlinks = loadHyperlinkRelationships(packagePart);
         try (InputStream stream = packagePart.getInputStream()) {
             XMLReader reader = SAXHelper.newXMLReader();
-            reader.setContentHandler(new XWPFDocumentXMLBodyHandler(
+            reader.setContentHandler(new OOXMLWordAndPowerPointTextHandler(
                     new XWPFToTextContentHandler(buffer), hyperlinks));
             reader.parse(new InputSource(new CloseShieldInputStream(stream)));
 
@@ -232,7 +233,7 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
         return null;
     }
 
-    private class XWPFToTextContentHandler implements XWPFDocumentXMLBodyHandler.XWPFBodyContentsHandler {
+    private class XWPFToTextContentHandler implements OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler {
         private final StringBuilder buffer;
 
         public XWPFToTextContentHandler(StringBuilder buffer) {
@@ -305,7 +306,7 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
         }
 
         @Override
-        public void startEditedSection(String editor, Date date, XWPFDocumentXMLBodyHandler.EditType editType) {
+        public void startEditedSection(String editor, Date date, OOXMLWordAndPowerPointTextHandler.EditType editType) {
 
         }