You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/17 00:47:04 UTC

[1/2] tika git commit: TIKA-2210 -- add experimental SAX parser for pptx -- this is a first cut. More refactoring is in order.

Repository: tika
Updated Branches:
  refs/heads/master 1d9445b40 -> 90cdf1f6a


http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFRunProperties.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFRunProperties.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFRunProperties.java
deleted file mode 100644
index ad2d656..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFRunProperties.java
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-/**
- * WARNING: This class is mutable.  Make a copy of it
- * if you want persistence!
- */
-
-class XWPFRunProperties {
-    boolean italics = false;
-    boolean bold = false;
-
-    public boolean getItalics() {
-        return italics;
-    }
-
-    public boolean getBold() {
-        return bold;
-    }
-
-    public void setItalics(boolean italics) {
-        this.italics = italics;
-    }
-
-    public void setBold(boolean bold) {
-        this.bold = bold;
-    }
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFStylesShim.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFStylesShim.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFStylesShim.java
index ff7a63f..b655200 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFStylesShim.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFStylesShim.java
@@ -25,6 +25,7 @@ import java.util.Map;
 import org.apache.poi.openxml4j.opc.PackagePart;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.ooxml.AbstractDocumentXMLBodyHandler;
 import org.apache.tika.sax.OfflineContentHandler;
 import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
@@ -71,11 +72,11 @@ public class XWPFStylesShim {
 
         @Override
         public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
-            if (uri == null || XWPFDocumentXMLBodyHandler.W_NS.equals(uri)) {
+            if (uri == null || AbstractDocumentXMLBodyHandler.W_NS.equals(uri)) {
                 if ("style".equals(localName)) {
-                    currentStyleId = atts.getValue(XWPFDocumentXMLBodyHandler.W_NS, "styleId");
+                    currentStyleId = atts.getValue(AbstractDocumentXMLBodyHandler.W_NS, "styleId");
                 } else if ("name".equals(localName)) {
-                    String name = atts.getValue(XWPFDocumentXMLBodyHandler.W_NS, "val");
+                    String name = atts.getValue(AbstractDocumentXMLBodyHandler.W_NS, "val");
                     if (currentStyleId != null && name != null) {
                         styles.put(currentStyleId, name);
                     }
@@ -85,7 +86,7 @@ public class XWPFStylesShim {
 
         @Override
         public void endElement(String uri, String localName, String qName) throws SAXException {
-            if (uri == null || XWPFDocumentXMLBodyHandler.W_NS.equals(uri)) {
+            if (uri == null || AbstractDocumentXMLBodyHandler.W_NS.equals(uri)) {
                 if ("style".equals(localName)) {
                     currentStyleId = null;
                 }

http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
index e1e91c4..f11cf33 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
@@ -23,6 +23,8 @@ import java.util.Date;
 
 import org.apache.tika.parser.microsoft.OfficeParserConfig;
 import org.apache.tika.parser.microsoft.WordExtractor;
+import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
+import org.apache.tika.parser.microsoft.ooxml.RunProperties;
 import org.apache.tika.parser.microsoft.ooxml.XWPFListManager;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
@@ -61,7 +63,7 @@ public class XWPFTikaBodyPartHandler implements XWPFDocumentXMLBodyHandler.XWPFB
     }
 
     @Override
-    public void run(XWPFRunProperties runProperties, String contents) {
+    public void run(RunProperties runProperties, String contents) {
         try {
             // True if we are currently in the named style tag:
             if (runProperties.getBold() != isBold) {
@@ -121,7 +123,7 @@ public class XWPFTikaBodyPartHandler implements XWPFDocumentXMLBodyHandler.XWPFB
     }
 
     @Override
-    public void startParagraph(XWPFParagraphProperties paragraphProperties) {
+    public void startParagraph(ParagraphProperties paragraphProperties) {
         if (pDepth == 0 && tableDepth == 0 && sdtDepth == 0) {
             paragraphTag = P;
             String styleClass = null;

http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
new file mode 100644
index 0000000..d8df4c9
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
@@ -0,0 +1,533 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.parser.microsoft.OfficeParserConfig;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Before;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+
+public class SXSLFExtractorTest extends TikaTest {
+
+    private ParseContext parseContext;
+    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+
+    @Before
+    public void setUp() {
+        parseContext = new ParseContext();
+        officeParserConfig.setUseSAXPptxExtractor(true);
+        parseContext.set(OfficeParserConfig.class, officeParserConfig);
+
+    }
+
+    @Test
+    public void basicTest() throws Exception {
+
+        List<Metadata> metadataList = getRecursiveMetadata("testPPT_various2.pptx", parseContext);
+
+        assertEquals("right number of attachments", 10, metadataList.size());
+
+        String mainContent = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
+
+        assertContains("This slide is hidden", mainContent);//TODO: parameterize this
+
+        //basic content
+        assertContains("FirstBullet", mainContent);
+
+        //hyperlink
+        assertContains("<a href=\"http://tika.apache.org/\">tika_hyperlink</a>", mainContent);
+        //hyperlink in cell
+        assertContains("<a href=\"http://lucene.apache.org/\">lucene_hyperlink</a>", mainContent);
+
+        //text box
+        assertContains("Slide2TextBox", mainContent);
+        assertContains("<td>R1c1</td>", mainContent);
+
+        //wordArt
+        assertContains("This is some WordART", mainContent);
+
+        //notes
+        assertContains("NotesForSlide2", mainContent);
+        assertContains("Notes for slide3", mainContent);
+        assertContains("NotesMasterHeader", mainContent);
+        assertContains("NotesMasterFooter", mainContent);
+        assertContains("NotesMasterPageNumber", mainContent);
+        assertContains("NotesWordArt", mainContent);
+        assertContains("NotesWordArtPage2", mainContent);
+        assertContains("NotesTableSlide2", mainContent);
+
+        //comments
+        assertContains(
+                "<p class=\"slide-comment\"><b>Timothy Allison (TA)</b>This is a reply to the initial comment</p>",
+                    mainContent);
+
+        //HandoutMaster
+        assertContains(
+                "HandoutHeader1", mainContent);
+        assertContains("HandoutFooter", mainContent);
+        assertContains("HandoutDate", mainContent);
+        assertContains("TextBoxInHandOut", mainContent);
+
+        //text box in master
+        assertContains("MASTERTEXTBOX", mainContent);
+
+        //equation
+        assertContains("3/4", mainContent);
+
+        //make sure footer elements are in their own <p/>
+        assertContains("<p>12/16/2016</p>", mainContent);
+        assertContains("<p>8</p>", mainContent);
+
+
+        assertContains("<td>NotesTableSlide2", mainContent);
+
+        assertContains("MASTERFOOTERMSG", mainContent);
+
+
+        //should not include boilerplate from master
+        assertNotContained("Click to edit Master", mainContent);
+        assertNotContained("Second level", mainContent);
+
+        //TODO: chart content
+        //assertContains("SLIDE3ChartTitle", mainContent);
+        //assertContains("Category 1", mainContent);
+    }
+
+    @Test
+    public void  poiBug54916Test() throws Exception {
+        String xml = getXML("testPPTX_overlappingRelations.pptx", parseContext).xml;
+        assertContains("POI cannot read this", xml);
+        assertContains("Has a relationship to another slide", xml);
+        assertContains("can read this too", xml);
+    }
+
+    /**
+     * We have a number of different powerpoint files,
+     * such as presentation, macro-enabled etc
+     */
+    @Test
+    public void testPowerPoint() throws Exception {
+        String[] extensions = new String[]{
+                "pptx", "pptm",
+                "ppsm",
+                "ppsx", "potm",
+                //"thmx", // TIKA-418: Will be supported in POI 3.7 beta 2
+                //"xps" // TIKA-418: Not yet supported by POI
+        };
+
+        String[] mimeTypes = new String[]{
+                "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+                "application/vnd.ms-powerpoint.presentation.macroenabled.12",
+                "application/vnd.ms-powerpoint.slideshow.macroenabled.12",
+                "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
+                "application/vnd.ms-powerpoint.template.macroenabled.12",
+        };
+
+        for (int i = 0; i < extensions.length; i++) {
+            String extension = extensions[i];
+            String filename = "testPPT." + extension;
+
+            Parser parser = new AutoDetectParser();
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+
+            try (InputStream input = getResourceAsStream("/test-documents/" + filename)) {
+                parser.parse(input, handler, metadata, parseContext);
+
+                assertEquals(
+                        "Mime-type checking for " + filename,
+                        mimeTypes[i],
+                        metadata.get(Metadata.CONTENT_TYPE));
+                assertEquals("Attachment Test", metadata.get(TikaCoreProperties.TITLE));
+                assertEquals("Rajiv", metadata.get(TikaCoreProperties.CREATOR));
+                assertEquals("Rajiv", metadata.get(Metadata.AUTHOR));
+
+                String content = handler.toString();
+                // Theme files don't have the text in them
+                if (extension.equals("thmx")) {
+                    assertEquals("", content);
+                } else {
+                    assertTrue(
+                            "Text missing for " + filename + "\n" + content,
+                            content.contains("Attachment Test")
+                    );
+                    assertTrue(
+                            "Text missing for " + filename + "\n" + content,
+                            content.contains("This is a test file data with the same content")
+                    );
+                    assertTrue(
+                            "Text missing for " + filename + "\n" + content,
+                            content.contains("content parsing")
+                    );
+                    assertTrue(
+                            "Text missing for " + filename + "\n" + content,
+                            content.contains("Different words to test against")
+                    );
+                    assertTrue(
+                            "Text missing for " + filename + "\n" + content,
+                            content.contains("Mystery")
+                    );
+                }
+            }
+        }
+    }
+
+    /**
+     * Test that the metadata is already extracted when the body is processed.
+     * See TIKA-1109
+     */
+    @Test
+    public void testPowerPointMetadataEarly() throws Exception {
+        String[] extensions = new String[]{
+                "pptx", "pptm", "ppsm", "ppsx", "potm"
+                //"thmx", // TIKA-418: Will be supported in POI 3.7 beta 2
+                //"xps" // TIKA-418: Not yet supported by POI
+        };
+
+        final String[] mimeTypes = new String[]{
+                "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+                "application/vnd.ms-powerpoint.presentation.macroenabled.12",
+                "application/vnd.ms-powerpoint.slideshow.macroenabled.12",
+                "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
+                "application/vnd.ms-powerpoint.template.macroenabled.12"
+        };
+
+        for (int i = 0; i < extensions.length; i++) {
+            String extension = extensions[i];
+            final String filename = "testPPT." + extension;
+
+            Parser parser = new AutoDetectParser();
+            final Metadata metadata = new Metadata();
+
+            // Allow the value to be accessed from the inner class
+            final int currentI = i;
+            ContentHandler handler = new BodyContentHandler() {
+                public void startDocument() {
+                    assertEquals(
+                            "Mime-type checking for " + filename,
+                            mimeTypes[currentI],
+                            metadata.get(Metadata.CONTENT_TYPE));
+                    assertEquals("Attachment Test", metadata.get(TikaCoreProperties.TITLE));
+                    assertEquals("Rajiv", metadata.get(TikaCoreProperties.CREATOR));
+                    assertEquals("Rajiv", metadata.get(Metadata.AUTHOR));
+
+                }
+
+            };
+
+            try (InputStream input = getResourceAsStream("/test-documents/" + filename)) {
+                parser.parse(input, handler, metadata, parseContext);
+            }
+        }
+    }
+
+    /**
+     * For the PowerPoint formats we don't currently support, ensure that
+     * we don't break either
+     */
+    @Test
+    public void testUnsupportedPowerPoint() throws Exception {
+        String[] extensions = new String[]{"xps", "thmx"};
+        String[] mimeTypes = new String[]{
+                "application/vnd.ms-xpsdocument",
+                "application/vnd.openxmlformats-officedocument" // Is this right?
+        };
+
+        for (int i = 0; i < extensions.length; i++) {
+            String extension = extensions[i];
+            String filename = "testPPT." + extension;
+
+            Parser parser = new AutoDetectParser();
+            Metadata metadata = new Metadata();
+            metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
+            ContentHandler handler = new BodyContentHandler();
+
+            try (InputStream input = getResourceAsStream("/test-documents/" + filename)) {
+                parser.parse(input, handler, metadata, parseContext);
+
+                // Should get the metadata
+                assertEquals(
+                        "Mime-type checking for " + filename,
+                        mimeTypes[i],
+                        metadata.get(Metadata.CONTENT_TYPE));
+
+                // But that's about it
+            }
+        }
+    }
+
+    @Test
+    public void testVariousPPTX() throws Exception {
+        Metadata metadata = new Metadata();
+        String xml = getXML("testPPT_various.pptx", metadata, parseContext).xml;
+        assertContains("<p>Footnote appears here", xml);
+        assertContains("<p>[1] This is a footnote.", xml);
+        assertContains("<p>This is the header text.</p>", xml);
+        assertContains("<p>This is the footer text.</p>", xml);
+        assertContains("<p>Here is a text box</p>", xml);
+        assertContains("<p>Bold", xml);
+        assertContains("italic underline superscript subscript", xml);
+        assertContains("<p>Here is a citation:", xml);
+        assertContains("Figure 1 This is a caption for Figure 1", xml);
+        assertContains("(Kramer)", xml);
+        assertContains("<table><tr>\t<td>Row 1 Col 1</td>", xml);
+        assertContains("<td>Row 2 Col 2</td>\t<td>Row 2 Col 3</td></tr>", xml);
+        assertContains("<p>Row 1 column 1</p>", xml);
+        assertContains("<p>Row 2 column 2</p>", xml);
+        assertContains("<p><a href=\"http://tika.apache.org/\">This is a hyperlink</a>", xml);
+        assertContains("<p>Here is a list:", xml);
+        for (int row = 1; row <= 3; row++) {
+            //assertContains("·\tBullet " + row, content);
+            //assertContains("\u00b7\tBullet " + row, content);
+            assertContains("<p>Bullet " + row, xml);
+        }
+        assertContains("Here is a numbered list:", xml);
+        for (int row = 1; row <= 3; row++) {
+            //assertContains(row + ")\tNumber bullet " + row, content);
+            //assertContains(row + ") Number bullet " + row, content);
+            // TODO: OOXMLExtractor fails to number the bullets:
+            assertContains("<p>Number bullet " + row, xml);
+        }
+
+        for (int row = 1; row <= 2; row++) {
+            for (int col = 1; col <= 3; col++) {
+                assertContains("Row " + row + " Col " + col, xml);
+            }
+        }
+
+        assertContains("Keyword1 Keyword2", xml);
+        assertEquals("Keyword1 Keyword2",
+                metadata.get(Metadata.KEYWORDS));
+
+        assertContains("Subject is here", xml);
+        // TODO: Remove subject in Tika 2.0
+        assertEquals("Subject is here",
+                metadata.get(Metadata.SUBJECT));
+        assertEquals("Subject is here",
+                metadata.get(OfficeOpenXMLCore.SUBJECT));
+
+        assertContains("Suddenly some Japanese text:", xml);
+        // Special version of (GHQ)
+        assertContains("\uff08\uff27\uff28\uff31\uff09", xml);
+        // 6 other characters
+        assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", xml);
+
+        assertContains("And then some Gothic text:", xml);
+        assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", xml);
+    }
+
+    @Test
+    public void testCommentPPTX() throws Exception {
+        XMLResult r = getXML("testPPT_comment.pptx", parseContext);
+        assertContains("<p class=\"slide-comment\"><b>Allison, Timothy B. (ATB)", r.xml);
+    }
+
+    @Test
+    public void testMasterFooter() throws Exception {
+
+        assertContains("Master footer is here",
+                getXML("testPPT_masterFooter.pptx", parseContext).xml);
+    }
+
+    /**
+     * TIKA-712 Master Slide Text from PPT and PPTX files
+     * should be extracted too
+     */
+    @Test
+    public void testMasterText() throws Exception {
+        assertContains("Text that I added to the master slide",
+                getXML("testPPT_masterText.pptx", parseContext).xml);
+    }
+
+    @Test
+    public void testMasterText2() throws Exception {
+        assertContains("Text that I added to the master slide",
+                getXML("testPPT_masterText2.pptx", parseContext).xml);
+    }
+
+    @Test
+    public void testWordArt() throws Exception {
+        assertContains("Here is some red word Art",
+                getXML("testWordArt.pptx", parseContext).xml);
+    }
+
+    @Test
+    public void testPowerPointCustomProperties() throws Exception {
+        Metadata metadata = new Metadata();
+        // Dedicated context for this test: sets Locale.US and enables the SAX pptx extractor.
+        ParseContext context = new ParseContext();
+        context.set(Locale.class, Locale.US);
+        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+        officeParserConfig.setUseSAXPptxExtractor(true);
+        context.set(OfficeParserConfig.class, officeParserConfig);
+        // Parse with the local context (not the shared parseContext field) so the Locale.US entry is passed along.
+        getXML("testPPT_custom_props.pptx", metadata, context);
+        assertEquals(
+                "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+                metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("JOUVIN ETIENNE", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("EJ04325S", metadata.get(TikaCoreProperties.MODIFIER));
+        assertEquals("EJ04325S", metadata.get(Metadata.LAST_AUTHOR));
+        assertEquals("2011-08-22T13:30:53Z", metadata.get(TikaCoreProperties.CREATED));
+        assertEquals("2011-08-22T13:30:53Z", metadata.get(Metadata.CREATION_DATE));
+        assertEquals("2011-08-22T13:32:49Z", metadata.get(TikaCoreProperties.MODIFIED));
+        assertEquals("2011-08-22T13:32:49Z", metadata.get(Metadata.DATE));
+        assertEquals("1", metadata.get(Office.SLIDE_COUNT));
+        assertEquals("3", metadata.get(Office.WORD_COUNT));
+        assertEquals("Test extraction properties pptx", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("true", metadata.get("custom:myCustomBoolean"));
+        assertEquals("3", metadata.get("custom:myCustomNumber"));
+        assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
+        assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
+        assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
+    }
+
+    // TIKA-997:
+    @Test
+    @Ignore("TODO: add in embedded file markup")
+    public void testEmbeddedZipInPPTX() throws Exception {
+        String xml = getXML("test_embedded_zip.pptx", parseContext).xml;
+        int h = xml.indexOf("<div class=\"embedded\" id=\"slide1_rId3\" />");
+        int i = xml.indexOf("Send me a note");
+        int j = xml.indexOf("<div class=\"embedded\" id=\"slide2_rId4\" />");
+        int k = xml.indexOf("<p>No title</p>");
+        assertTrue(h != -1);
+        assertTrue(i != -1);
+        assertTrue(j != -1);
+        assertTrue(k != -1);
+        assertTrue(h < i);
+        assertTrue(i < j);
+        assertTrue(j < k);
+    }
+
+    // TIKA-1032:
+    @Test
+    @Ignore("TODO: add in embedded file markup")
+    public void testEmbeddedPPTXTwoSlides() throws Exception {
+        String xml = getXML("testPPT_embedded_two_slides.pptx", parseContext).xml;
+        assertContains("<div class=\"embedded\" id=\"slide1_rId7\" />", xml);
+        assertContains("<div class=\"embedded\" id=\"slide2_rId7\" />", xml);
+    }
+
+    //TIKA-817
+    @Test
+    public void testPPTXAutodate() throws Exception {
+        //Following POI-52368, the stored date is extracted,
+        //not the auto-generated date.
+
+        XMLResult result = getXML("testPPT_autodate.pptx", parseContext);
+        assertContains("<p>Now</p>\n" +
+                "<p>2011-12-19 10:20:04 AM</p>\n", result.xml);
+
+    }
+
+    @Test
+    public void testPPTXThumbnail() throws Exception {
+        String xml = getXML("testPPTX_Thumbnail.pptx", parseContext).xml;
+        int a = xml.indexOf("<body><div class=\"slide-content\"><p>This file contains an embedded thumbnail");
+        int b = xml.indexOf("<div class=\"embedded\" id=\"/docProps/thumbnail.jpeg\" />");
+        assertTrue(a != -1);
+        assertTrue(b != -1);
+        assertTrue(a < b);
+    }
+
+    @Test
+    public void testEncrypted() throws Exception {
+        Map<String, String> tests = new HashMap<String, String>();
+        tests.put("testPPT_protected_passtika.pptx",
+                "This is an encrypted PowerPoint 2007 slide.");
+
+        Parser parser = new AutoDetectParser();
+        Metadata m = new Metadata();
+        PasswordProvider passwordProvider = new PasswordProvider() {
+            @Override
+            public String getPassword(Metadata metadata) {
+                return "tika";
+            }
+        };
+        ParseContext passwordContext = new ParseContext();
+        passwordContext.set(org.apache.tika.parser.PasswordProvider.class, passwordProvider);
+        passwordContext.set(OfficeParserConfig.class, officeParserConfig);
+
+        for (Map.Entry<String, String> e : tests.entrySet()) {
+            try (InputStream is = getResourceAsStream("/test-documents/"+e.getKey())) {
+                ContentHandler handler = new BodyContentHandler();
+                parser.parse(is, handler, m, passwordContext);
+                assertContains(e.getValue(), handler.toString());
+            }
+        }
+
+        ParseContext context = new ParseContext();
+        //now try with no password
+        for (Map.Entry<String, String> e : tests.entrySet()) {
+            boolean exc = false;
+            try (InputStream is = getResourceAsStream("/test-documents/"+e.getKey())) {
+                ContentHandler handler = new BodyContentHandler();
+                parser.parse(is, handler, m, context);
+            } catch (EncryptedDocumentException ex) {
+                exc = true;
+            }
+            assertTrue(exc);
+        }
+
+    }
+
+    @Test
+    public void testEmbeddedPDFInPPTX() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testPPT_EmbeddedPDF.pptx", parseContext);
+        Metadata pdfMetadata1 = metadataList.get(4);
+        assertContains("Apache Tika", pdfMetadata1.get(RecursiveParserWrapper.TIKA_CONTENT));
+        Metadata pdfMetadata2 = metadataList.get(5);
+        assertContains("Hello World", pdfMetadata2.get(RecursiveParserWrapper.TIKA_CONTENT));
+    }
+
+    @Test
+    public void testMacrosInPptm() throws Exception {
+        Metadata minExpected = new Metadata();
+        minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
+        minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
+        minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
+        minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+
+        assertContainsAtLeast(minExpected, getRecursiveMetadata("testPPT_macros.pptm", parseContext));
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/test/resources/test-documents/testPPTX_overlappingRelations.pptx
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testPPTX_overlappingRelations.pptx b/tika-parsers/src/test/resources/test-documents/testPPTX_overlappingRelations.pptx
new file mode 100644
index 0000000..7551630
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPPTX_overlappingRelations.pptx differ

http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/test/resources/test-documents/testPPT_various2.pptx
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testPPT_various2.pptx b/tika-parsers/src/test/resources/test-documents/testPPT_various2.pptx
new file mode 100644
index 0000000..1f1daf0
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPPT_various2.pptx differ


[2/2] tika git commit: TIKA-2210 -- add experimental SAX parser for pptx -- this is a first cut. More refactoring is in order.

Posted by ta...@apache.org.
TIKA-2210 -- add experimental SAX parser for pptx -- this is a first cut.  More refactoring is in order.


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/90cdf1f6
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/90cdf1f6
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/90cdf1f6

Branch: refs/heads/master
Commit: 90cdf1f6a844e0d0541167bc0364bb3963f93b2d
Parents: 1d9445b
Author: tballison <ta...@mitre.org>
Authored: Fri Dec 16 19:46:55 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Fri Dec 16 19:46:55 2016 -0500

----------------------------------------------------------------------
 CHANGES.txt                                     |   3 +
 .../src/test/java/org/apache/tika/TikaTest.java |   4 +
 .../parser/microsoft/AbstractOfficeParser.java  |   4 +
 .../parser/microsoft/OfficeParserConfig.java    |  19 +-
 .../ooxml/AbstractDocumentXMLBodyHandler.java   |  99 ++++
 .../microsoft/ooxml/AbstractOOXMLExtractor.java |  74 +++
 .../microsoft/ooxml/MetadataExtractor.java      |   4 +-
 .../microsoft/ooxml/OOXMLExtractorFactory.java  |  47 +-
 .../microsoft/ooxml/ParagraphProperties.java    |  56 ++
 .../parser/microsoft/ooxml/RunProperties.java   |  44 ++
 .../SXSLFPowerPointExtractorDecorator.java      | 428 +++++++++++++++
 .../ooxml/SXWPFWordExtractorDecorator.java      |  56 +-
 .../ooxml/xslf/XSLFDocumentXMLBodyHandler.java  | 330 ++++++++++++
 .../xslf/XSLFEventBasedPowerPointExtractor.java | 161 ++++++
 .../ooxml/xslf/XSLFTikaBodyPartHandler.java     | 262 +++++++++
 .../ooxml/xwpf/XWPFDocumentXMLBodyHandler.java  |  76 +--
 .../ooxml/xwpf/XWPFEventBasedWordExtractor.java |  28 +-
 .../ooxml/xwpf/XWPFParagraphProperties.java     |  56 --
 .../microsoft/ooxml/xwpf/XWPFRunProperties.java |  44 --
 .../microsoft/ooxml/xwpf/XWPFStylesShim.java    |   9 +-
 .../ooxml/xwpf/XWPFTikaBodyPartHandler.java     |   6 +-
 .../microsoft/ooxml/SXSLFExtractorTest.java     | 533 +++++++++++++++++++
 .../testPPTX_overlappingRelations.pptx          | Bin 0 -> 38135 bytes
 .../test-documents/testPPT_various2.pptx        | Bin 0 -> 248937 bytes
 24 files changed, 2083 insertions(+), 260 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 90823c6..e215499 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 1.15 - ??
 
+  * Added experimental SAX parser for .pptx files. To select this parser,
+    call setUseSAXPptxExtractor(true) on OfficeParserConfig (TIKA-2210).
+
   * Upgrade to PDFBox 2.0.4 (TIKA-2209).
 
   * Refactor MockParser to consolidate service loading

http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-core/src/test/java/org/apache/tika/TikaTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 6644d86..11eb801 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -173,6 +173,10 @@ public abstract class TikaTest {
         return getXML(filePath, new AutoDetectParser(), parseContext);
     }
 
+    protected XMLResult getXML(String filePath, Metadata metadata, ParseContext parseContext) throws Exception {
+        return getXML(getResourceAsStream("/test-documents/"+filePath), new AutoDetectParser(), metadata, parseContext);
+    }
+
     protected XMLResult getXML(String filePath, Metadata metadata) throws Exception {
         return getXML(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), metadata, null);
     }

http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
index d8186bc..e01fe0c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
@@ -66,4 +66,8 @@ public abstract class AbstractOfficeParser extends AbstractParser {
         defaultOfficeParserConfig.setUseSAXDocxExtractor(useSAXDocxExtractor);
     }
 
+    @Field
+    public void setUseSAXPptxExtractor(boolean useSAXPptxExtractor) {
+        defaultOfficeParserConfig.setUseSAXPptxExtractor(useSAXPptxExtractor);
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
index f3cdbfe..05275d7 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
@@ -25,6 +25,7 @@ public class OfficeParserConfig implements Serializable {
     private boolean includeMoveFromContent = false;
 
     private boolean useSAXDocxExtractor = false;
+    private boolean useSAXPptxExtractor = false;
 
     /**
      * Sets whether or not the parser should include deleted content.
@@ -71,12 +72,28 @@ public class OfficeParserConfig implements Serializable {
      * If set to <code>false</code>, the classic parser will be used; if <code>true</code>,
      * the new experimental parser will be used.
      * <p/>
-     * Default: classic parser
+     * Default: <code>false</code> (classic DOM parser)
      * @param useSAXDocxExtractor
      */
     public void setUseSAXDocxExtractor(boolean useSAXDocxExtractor) {
         this.useSAXDocxExtractor = useSAXDocxExtractor;
     }
+
+    /**
+     * Use the experimental SAX-based streaming PPTX parser?
+     * If set to <code>false</code>, the classic parser will be used; if <code>true</code>,
+     * the new experimental parser will be used.
+     * <p/>
+     * Default: <code>false</code> (classic DOM parser)
+     * @param useSAXPptxExtractor <code>true</code> to select the experimental SAX-based PPTX parser
+     */
+    public void setUseSAXPptxExtractor(boolean useSAXPptxExtractor) {
+        this.useSAXPptxExtractor = useSAXPptxExtractor;
+    }
+
+    public boolean getUseSAXPptxExtractor() {
+        return useSAXPptxExtractor;
+    }
 }
 
 

http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractDocumentXMLBodyHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractDocumentXMLBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractDocumentXMLBodyHandler.java
new file mode 100644
index 0000000..5037fd2
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractDocumentXMLBodyHandler.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml;
+
+
+import org.xml.sax.helpers.DefaultHandler;
+
+public class AbstractDocumentXMLBodyHandler extends DefaultHandler {
+    //holds constants and mutable state for SAX body handlers; declares no methods of its own
+    protected final static String R = "r";
+    protected final static String FLD = "fld";
+    protected final static String RPR = "rPr";
+    protected final static String P = "p";
+    protected static String P_STYLE = "pStyle"; //NOTE(review): not final, unlike the sibling constants -- confirm intent
+    protected final static String PPR = "pPr";
+    protected static String T = "t"; //NOTE(review): not final, unlike the sibling constants -- confirm intent
+    protected final static String TAB = "tab";
+    protected final static String B = "b";
+    protected final static String ILVL = "ilvl";
+    protected final static String NUM_ID = "numId";
+    protected final static String TC = "tc";
+    protected final static String TR = "tr";
+    protected final static String I = "i";
+    protected final static String NUM_PR = "numPr";
+    protected final static String BR = "br";
+    protected final static String HYPERLINK = "hyperlink";
+    protected final static String TBL = "tbl";
+    protected final static String PIC = "pic";
+    protected final static String PICT = "pict";
+    protected final static String IMAGEDATA = "imagedata";
+    protected final static String BLIP = "blip";
+    protected final static String CHOICE = "Choice";
+    protected final static String FALLBACK = "Fallback";
+    protected final static String OLE_OBJECT = "OLEObject";
+    protected final static String CR = "cr";
+    //XML namespace URIs
+    public final static String W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
+    protected final static String MC_NS = "http://schemas.openxmlformats.org/markup-compatibility/2006";
+    protected final static String O_NS = "urn:schemas-microsoft-com:office:office";
+    protected final static String PIC_NS = "http://schemas.openxmlformats.org/drawingml/2006/picture";
+    protected final static String DRAWING_MAIN_NS = "http://schemas.openxmlformats.org/drawingml/2006/main";
+    protected final static String V_NS = "urn:schemas-microsoft-com:vml";
+
+    protected final static String OFFICE_DOC_RELATIONSHIP_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
+    //one-element char array holding '\t'; it is filled in by the static initializer below
+    protected final static char[] TAB_CHAR = new char[1];
+    protected final static char NEWLINE = '\n';
+
+    static {
+        TAB_CHAR[0] = '\t';
+    }
+
+    protected boolean inR = false;//in run or in field
+    protected boolean inT = false;
+    protected boolean inRPr = false;
+    protected boolean inNumPr = false;
+
+    protected boolean inPic = false;
+    boolean inPict = false; //NOTE(review): package-private, unlike the protected flags around it
+    protected String picDescription = null;
+    protected String picRId = null;
+    String picFilename = null; //NOTE(review): package-private, unlike the protected fields around it
+
+    //mechanism used to determine when to
+    //signal the start of the p, and still
+    //handle p with pPr and those without
+    protected boolean lastStartElementWasP = false;
+    //have we signaled the start of a p?
+    //pPr can happen multiple times within a p
+    //<p><pPr/><r><t>text</t></r><pPr></p>
+    protected boolean pStarted = false;
+
+    //alternate content can be embedded in itself.
+    //need to track depth.
+    //if in alternate, choose fallback, maybe make this configurable?
+    protected int inACChoiceDepth = 0;
+    protected int inACFallbackDepth = 0;
+
+    protected RunProperties currRunProperties = new RunProperties();
+    protected ParagraphProperties currPProperties = new ParagraphProperties();
+
+    protected final StringBuilder runBuffer = new StringBuilder();
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 6bc867d..a56d43b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -18,12 +18,15 @@ package org.apache.tika.parser.microsoft.ooxml;
 
 import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
 
+import java.io.File;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URI;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Map;
 import java.util.Set;
 
 import org.apache.poi.POIXMLDocument;
@@ -32,13 +35,16 @@ import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
 import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.openxml4j.opc.PackagePart;
 import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
 import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
 import org.apache.poi.openxml4j.opc.TargetMode;
+import org.apache.poi.openxml4j.opc.internal.FileHelper;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.poi.poifs.filesystem.Ole10Native;
 import org.apache.poi.poifs.filesystem.Ole10NativeException;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.xwpf.usermodel.XWPFRelation;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
@@ -64,6 +70,9 @@ import org.xml.sax.helpers.AttributesImpl;
  * populates the {@link XHTMLContentHandler} object received as parameter.
  */
 public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
+
+
+
     static final String RELATION_AUDIO = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/audio";
     static final String RELATION_IMAGE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image";
     static final String RELATION_OLE_OBJECT = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/oleObject";
@@ -72,6 +81,15 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
     static final String RELATION_OFFICE_DOCUMENT = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
     private static final String TYPE_OLE_OBJECT =
             "application/vnd.openxmlformats-officedocument.oleObject";
+
+    protected final static String[] EMBEDDED_RELATIONSHIPS = new String[]{
+            RELATION_AUDIO,
+            RELATION_IMAGE,
+            RELATION_PACKAGE,
+            RELATION_OFFICE_DOCUMENT
+    };
+
+
     private final EmbeddedDocumentExtractor embeddedExtractor;
     protected POIXMLTextExtractor extractor;
 
@@ -344,4 +362,60 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
             throw new TikaException("Broken OOXML file", e);
         }
     }
+
+    /**
+     * Collects hyperlink targets and embedded file names; used by the SAX docx and pptx decorators.
+     *
+     * @param bodyPart part whose relationships are read
+     * @param includeInternal if false, hyperlinks whose target mode is INTERNAL are skipped
+     * @return map of relationship id to hyperlink URI, or to the embedded target's file name
+     */
+    protected Map<String, String> loadLinkedRelationships(PackagePart bodyPart, boolean includeInternal) {
+        Map<String, String> linkedRelationships = new HashMap<>();
+        try {
+            PackageRelationshipCollection prc = bodyPart.getRelationshipsByType(XWPFRelation.HYPERLINK.getRelation());
+            for (int i = 0; i < prc.size(); i++) {
+                PackageRelationship pr = prc.getRelationship(i);
+                if (pr == null) {
+                    continue;
+                }
+                if (! includeInternal && TargetMode.INTERNAL.equals(pr.getTargetMode())) {
+                    continue;
+                }
+                String id = pr.getId();
+                String url = (pr.getTargetURI() == null) ? null : pr.getTargetURI().toString();
+                if (id != null && url != null) {
+                    linkedRelationships.put(id, url);
+                }
+            }
+            //second pass: every type in EMBEDDED_RELATIONSHIPS, mapping id -> file name of the target
+            for (String rel : EMBEDDED_RELATIONSHIPS) {
+                //if no file name can be derived, the full target URI string is stored; a null target becomes ""
+                prc = bodyPart.getRelationshipsByType(rel);
+                for (int i = 0; i < prc.size(); i++) {
+                    PackageRelationship pr = prc.getRelationship(i);
+                    if (pr == null) {
+                        continue;
+                    }
+                    String id = pr.getId();
+                    String uriString = (pr.getTargetURI() == null) ? null : pr.getTargetURI().toString();
+                    String fileName = uriString;
+                    if (pr.getTargetURI() != null) {
+                        try {
+                            fileName = FileHelper.getFilename(new File(fileName));
+                        } catch (Exception e) {
+                            fileName = uriString;
+                        }
+                    }
+                    if (id != null) {
+                        fileName = (fileName == null) ? "" : fileName;
+                        linkedRelationships.put(id, fileName);
+                    }
+                }
+            }
+
+        } catch (InvalidFormatException e) { //NOTE(review): swallowed; entries gathered so far are still returned
+        }
+        return linkedRelationships;
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
index d392346..21c6252 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
@@ -36,6 +36,7 @@ import org.apache.tika.metadata.PagedText;
 import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.microsoft.SummaryExtractor;
+import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor;
 import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
 import org.apache.xmlbeans.impl.values.XmlValueOutOfRangeException;
 import org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperty;
@@ -59,7 +60,8 @@ public class MetadataExtractor {
     public void extract(Metadata metadata) throws TikaException {
         if (extractor.getDocument() != null ||
                 ((extractor instanceof XSSFEventBasedExcelExtractor ||
-                        extractor instanceof XWPFEventBasedWordExtractor) &&
+                        extractor instanceof XWPFEventBasedWordExtractor ||
+                        extractor instanceof XSLFEventBasedPowerPointExtractor) &&
                         extractor.getPackage() != null)) {
             extractMetadata(extractor.getCoreProperties(), metadata);
             extractMetadata(extractor.getExtendedProperties(), metadata);

http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index bbee6b7..30ed1ec 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -30,8 +30,8 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.openxml4j.opc.PackageAccess;
 import org.apache.poi.openxml4j.opc.PackagePart;
 import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
-import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
 import org.apache.poi.xslf.usermodel.XMLSlideShow;
+import org.apache.poi.xslf.usermodel.XSLFRelation;
 import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
 import org.apache.poi.xwpf.usermodel.XWPFDocument;
@@ -43,6 +43,7 @@ import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.microsoft.OfficeParserConfig;
+import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor;
 import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
 import org.apache.tika.parser.pkg.ZipContainerDetector;
 import org.apache.xmlbeans.XmlException;
@@ -93,6 +94,9 @@ public class OOXMLExtractorFactory {
             if (config.getUseSAXDocxExtractor()) {
                 poiExtractor = trySXWPF(pkg);
             }
+            if (poiExtractor == null && config.getUseSAXPptxExtractor()) {
+                poiExtractor = trySXSLF(pkg);
+            }
             if (poiExtractor == null) {
                 poiExtractor = ExtractorFactory.createExtractor(pkg);
             }
@@ -103,7 +107,12 @@ public class OOXMLExtractorFactory {
                         context, (XSSFEventBasedExcelExtractor) poiExtractor, locale);
             } else if (poiExtractor instanceof XWPFEventBasedWordExtractor) {
                 extractor = new SXWPFWordExtractorDecorator(context,
-                        (XWPFEventBasedWordExtractor)poiExtractor);
+                        (XWPFEventBasedWordExtractor) poiExtractor);
+                metadata.add("X-Parsed-By", XWPFEventBasedWordExtractor.class.getSimpleName());
+            } else if (poiExtractor instanceof XSLFEventBasedPowerPointExtractor) {
+                extractor = new SXSLFPowerPointExtractorDecorator(context,
+                        (XSLFEventBasedPowerPointExtractor) poiExtractor);
+                metadata.add("X-Parsed-By", XSLFEventBasedPowerPointExtractor.class.getSimpleName());
             } else if (document == null) {
                 throw new TikaException(
                         "Expecting UserModel based POI OOXML extractor with a document, but none found. " +
@@ -111,7 +120,7 @@ public class OOXMLExtractorFactory {
                 );
             } else if (document instanceof XMLSlideShow) {
                 extractor = new XSLFPowerPointExtractorDecorator(
-                        context, (XSLFPowerPointExtractor) poiExtractor);
+                        context, (org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) poiExtractor);
             } else if (document instanceof XWPFDocument) {
                 extractor = new XWPFWordExtractorDecorator(
                         context, (XWPFWordExtractor) poiExtractor);
@@ -119,6 +128,7 @@ public class OOXMLExtractorFactory {
                 extractor = new POIXMLTextExtractorDecorator(context, poiExtractor);
             }
 
+
             // Get the bulk of the metadata first, so that it's accessible during
             //  parsing if desired by the client (see TIKA-1109)
             extractor.getMetadataExtractor().extract(metadata);
@@ -146,7 +156,7 @@ public class OOXMLExtractorFactory {
 
     private static POIXMLTextExtractor trySXWPF(OPCPackage pkg) throws XmlException, OpenXML4JException, IOException {
         PackageRelationshipCollection packageRelationshipCollection = pkg.getRelationshipsByType("http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument");
-        if(packageRelationshipCollection.size() == 0) {
+        if (packageRelationshipCollection.size() == 0) {
             packageRelationshipCollection = pkg.getRelationshipsByType("http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument");
         }
 
@@ -163,4 +173,33 @@ public class OOXMLExtractorFactory {
         return null;
     }
 
+    private static POIXMLTextExtractor trySXSLF(OPCPackage pkg) throws XmlException, OpenXML4JException, IOException {
+        //returns an event-based extractor if the core part's content type is recognized below, otherwise null
+        PackageRelationshipCollection packageRelationshipCollection = pkg.getRelationshipsByType("http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument");
+        if (packageRelationshipCollection.size() == 0) {
+            packageRelationshipCollection = pkg.getRelationshipsByType("http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument");
+        }
+
+        if (packageRelationshipCollection.size() == 0) {
+            return null;
+        }
+        PackagePart corePart = pkg.getPart(packageRelationshipCollection.getRelationship(0));
+        String targetContentType = corePart.getContentType();
+
+        XSLFRelation[] xslfRelations = org.apache.poi.xslf.extractor.XSLFPowerPointExtractor.SUPPORTED_TYPES;
+
+        for (int i = 0; i < xslfRelations.length; i++) {
+            XSLFRelation xslfRelation = xslfRelations[i];
+            if (xslfRelation.getContentType().equals(targetContentType)) {
+                return new XSLFEventBasedPowerPointExtractor(pkg);
+            }
+        }
+        //NOTE(review): THEME_MANAGER is accepted in addition to SUPPORTED_TYPES -- reason not visible here
+        if (XSLFRelation.THEME_MANAGER.getContentType().equals(targetContentType)) {
+            return new XSLFEventBasedPowerPointExtractor(pkg);
+        }
+        return null;
+    }
+
+
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/ParagraphProperties.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/ParagraphProperties.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/ParagraphProperties.java
new file mode 100644
index 0000000..62ee31e
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/ParagraphProperties.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml;
+
+
+public class ParagraphProperties {
+    //mutable holder for a paragraph's style id, ilvl and numId; reset() restores null/-1/-1
+    private String styleId;
+    private int ilvl = -1;
+    private int numId = -1;
+
+    public String getStyleID() {
+        return  styleId;
+    }
+
+    public void setStyleID(String styleId) {
+        this.styleId = styleId;
+    }
+
+    public void reset() {
+        styleId = null;
+        ilvl = -1;
+        numId = -1;
+    }
+
+    public void setIlvl(int ilvl) {
+        this.ilvl = ilvl;
+    }
+
+    public void setNumId(int numId) {
+        this.numId = numId;
+    }
+
+    public int getIlvl() {
+        return ilvl;
+    }
+
+    public int getNumId() {
+        return numId;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java
new file mode 100644
index 0000000..9fbfcd8
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml;
+
+/**
+ * Holds the bold and italics flags of a run. WARNING: This class is mutable.
+ * Make a copy of it if you want persistence!
+ */
+
+public class RunProperties {
+    boolean italics = false; //package-private field; the accessors below are public
+    boolean bold = false; //package-private field; the accessors below are public
+
+    public boolean getItalics() {
+        return italics;
+    }
+
+    public boolean getBold() {
+        return bold;
+    }
+
+    public void setItalics(boolean italics) {
+        this.italics = italics;
+    }
+
+    public void setBold(boolean bold) {
+        this.bold = bold;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
new file mode 100644
index 0000000..1ab8bd3
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
@@ -0,0 +1,428 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackagePartName;
+import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.openxml4j.opc.PackagingURIHelper;
+import org.apache.poi.openxml4j.opc.TargetMode;
+import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xslf.usermodel.XSLFRelation;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFDocumentXMLBodyHandler;
+import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor;
+import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFTikaBodyPartHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * SAX/Streaming pptx extractor
+ */
+public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
+
+    private final static String HANDOUT_MASTER = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/handoutMaster";
+
+    //a pptx file should have one of these "main story" parts
+    private final static String[] MAIN_STORY_PART_RELATIONS = new String[]{
+            XSLFRelation.MAIN.getContentType(),
+            XSLFRelation.PRESENTATION_MACRO.getContentType(),
+            XSLFRelation.PRESENTATIONML.getContentType(),
+            XSLFRelation.PRESENTATIONML_TEMPLATE.getContentType(),
+            XSLFRelation.MACRO.getContentType(),
+            XSLFRelation.MACRO_TEMPLATE.getContentType(),
+            XSLFRelation.THEME_MANAGER.getContentType()
+
+
+            //TODO: what else
+    };
+
+    private final OPCPackage opcPackage;
+    private final ParseContext context;
+    private PackagePart mainDocument = null;
+    private final CommentAuthors commentAuthors = new CommentAuthors();
+
+    public SXSLFPowerPointExtractorDecorator(ParseContext context, XSLFEventBasedPowerPointExtractor extractor) {
+        super(context, extractor);
+        this.opcPackage = extractor.getPackage();
+        this.context = context;
+        //first content type with a part wins; TODO: if mainDocument == null, throw exception
+        for (String mainStoryType : MAIN_STORY_PART_RELATIONS) {
+            List<PackagePart> candidates = opcPackage.getPartsByContentType(mainStoryType);
+            if (!candidates.isEmpty()) {
+                mainDocument = candidates.get(0);
+                break;
+            }
+        }
+    }
+
+    /**
+     * Loads the comment authors, emits every slide related to the main part,
+     * then the slide masters and the handout master.  An InvalidFormatException
+     * while resolving slides ends slide processing; the masters are still attempted.
+     *
+     * @see XSLFPowerPointExtractor#getText()
+     */
+    protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException {
+        loadCommentAuthors();
+
+        //TODO: should check for custShowLst and order based on sldLst
+        try {
+            PackageRelationshipCollection prc = mainDocument.getRelationshipsByType(XSLFRelation.SLIDE.getRelation());
+            for (int i = 0; i < prc.size(); i++) {
+                handleSlidePart(mainDocument.getRelatedPart(prc.getRelationship(i)), xhtml);
+            }
+        } catch (InvalidFormatException e) {
+            //best effort: stop processing slides but still try the master parts below
+        }
+        handleBasicRelatedParts(XSLFRelation.SLIDE_MASTER.getRelation(),
+                "slide-master",
+                mainDocument,
+                new PlaceHolderSkipper(new XSLFDocumentXMLBodyHandler(
+                        new XSLFTikaBodyPartHandler(xhtml), new HashMap<String, String>())));
+
+        handleBasicRelatedParts(HANDOUT_MASTER,
+                "slide-handout-master",
+                mainDocument,
+                new XSLFDocumentXMLBodyHandler(
+                        new XSLFTikaBodyPartHandler(xhtml), new HashMap<String, String>())
+        );
+    }
+
+    private void loadCommentAuthors() {
+        //fills commentAuthors from the comment-authors part(s); parts that fail are skipped
+        PackageRelationshipCollection prc = null;
+        try {
+            prc = mainDocument.getRelationshipsByType(XSLFRelation.COMMENT_AUTHORS.getRelation());
+        } catch (InvalidFormatException e) {
+            //treated the same as "no comment authors"
+        }
+        if (prc == null || prc.size() == 0) {
+            return;
+        }
+
+        for (int i = 0; i < prc.size(); i++) {
+            PackagePart commentAuthorsPart = null;
+            try {
+                commentAuthorsPart = mainDocument.getRelatedPart(prc.getRelationship(i));
+            } catch (InvalidFormatException e) {
+                //unresolvable relationship; the null check below skips it
+            }
+            if (commentAuthorsPart == null) {
+                continue;
+            }
+            try (InputStream stream = commentAuthorsPart.getInputStream()) {
+                context.getSAXParser().parse(
+                        new CloseShieldInputStream(stream),
+                        new OfflineContentHandler(new XSLFCommentAuthorHandler()));
+            } catch (TikaException | SAXException | IOException e) {
+                //TODO: do something with this; for now authors from this part may be missing
+            }
+        }
+    }
+
+    private void handleSlidePart(PackagePart slidePart, XHTMLContentHandler xhtml) throws IOException, SAXException {
+        Map<String, String> linkedRelationships = loadLinkedRelationships(slidePart, false);
+        //the slide's own XML goes into <div class="slide-content">; the parts related to the
+        //slide (layout, notes, notes master, comments) are emitted after that div is closed
+        xhtml.startElement("div", "class", "slide-content");
+        try (InputStream stream = slidePart.getInputStream()) {
+            context.getSAXParser().parse(
+                    new CloseShieldInputStream(stream),
+                    new OfflineContentHandler(new EmbeddedContentHandler(
+                            new XSLFDocumentXMLBodyHandler(
+                                    new XSLFTikaBodyPartHandler(xhtml), linkedRelationships))));
+
+        } catch (TikaException e) {
+            //TODO: do something with this; for now the exception is dropped
+        }
+
+        xhtml.endElement("div");
+
+        //NOTE(review): the handlers below reuse the relationship map loaded from the slide part - confirm
+        handleBasicRelatedParts(XSLFRelation.SLIDE_LAYOUT.getRelation(),
+                "slide-master-content", slidePart,
+                new PlaceHolderSkipper(new XSLFDocumentXMLBodyHandler(
+                        new XSLFTikaBodyPartHandler(xhtml), linkedRelationships))
+                );
+
+        handleBasicRelatedParts(XSLFRelation.NOTES.getRelation(),
+                "slide-notes", slidePart,
+                new XSLFDocumentXMLBodyHandler(
+                        new XSLFTikaBodyPartHandler(xhtml), linkedRelationships));
+
+        handleBasicRelatedParts(XSLFRelation.NOTES_MASTER.getRelation(),
+                "slide-notes-master", slidePart,
+                new XSLFDocumentXMLBodyHandler(
+                        new XSLFTikaBodyPartHandler(xhtml), linkedRelationships));
+
+        handleBasicRelatedParts(XSLFRelation.COMMENTS.getRelation(),
+                null, slidePart,
+                new XSLFCommentsHandler(xhtml));
+
+//        handleBasicRelatedParts("");
+    }
+
+    /**
+     * Parses every part that <code>parentPart</code> references through the given
+     * relationship type (comments, master, notes, etc.), wrapped in a single div.
+     *
+     * @param contentType relationship type handed to getRelationshipsByType()
+     * @param xhtmlClassLabel class attribute for the wrapping div; may be null
+     * @param parentPart part whose relationships are followed
+     * @param contentHandler receives the div events and the parsed SAX events
+     */
+    private void handleBasicRelatedParts(String contentType, String xhtmlClassLabel,
+                                         PackagePart parentPart, ContentHandler contentHandler) throws SAXException {
+        PackageRelationshipCollection relatedPartPRC = null;
+        try {
+            relatedPartPRC = parentPart.getRelationshipsByType(contentType);
+        } catch (InvalidFormatException e) {
+            //swallow
+        }
+        if (relatedPartPRC != null && relatedPartPRC.size() > 0) {
+            AttributesImpl attributes = new AttributesImpl();
+            //a caller may pass null for the label; never emit a null attribute value
+            if (xhtmlClassLabel != null) {
+                attributes.addAttribute("", "class", "class", "CDATA", xhtmlClassLabel);
+            }
+            contentHandler.startElement("", "div", "div", attributes);
+            for (int i = 0; i < relatedPartPRC.size(); i++) {
+                PackageRelationship relatedPartPackageRelationship = relatedPartPRC.getRelationship(i);
+                try {
+                    PackagePart relatedPartPart = parentPart.getRelatedPart(relatedPartPackageRelationship);
+                    try (InputStream stream = relatedPartPart.getInputStream()) {
+                        //shield the stream from the parser, as the other parse calls in this class do
+                        context.getSAXParser().parse(new CloseShieldInputStream(stream),
+                                new OfflineContentHandler(new EmbeddedContentHandler(contentHandler)));
+                    } catch (IOException|TikaException e) {
+                        //do something with this
+                    }
+                } catch (InvalidFormatException e) {
+                    //unresolvable relationship; move on to the next one
+                }
+            }
+            contentHandler.endElement("", "div", "div");
+        }
+    }
+
+    /**
+     * Returns, for each slide, its internal VML drawing parts followed by the slide part; the main part comes last.
+     */
+    @Override
+    protected List<PackagePart> getMainDocumentParts() {
+        //TODO: consider: getPackage().getPartsByName(Pattern.compile("/ppt/embeddings/.*?
+        //TODO: consider: getPackage().getPartsByName(Pattern.compile("/ppt/media/.*?
+        List<PackagePart> collected = new ArrayList<>();
+        try {
+            PackageRelationshipCollection slideRels = mainDocument.getRelationshipsByType(XSLFRelation.SLIDE.getRelation());
+            for (int idx = 0; idx < slideRels.size(); idx++) {
+                PackagePart slide = mainDocument.getRelatedPart(slideRels.getRelationship(idx));
+                for (PackageRelationship drawingRel : slide.getRelationshipsByType(XSLFRelation.VML_DRAWING.getRelation())) {
+                    if (drawingRel.getTargetMode() != TargetMode.INTERNAL) {
+                        continue;
+                    }
+                    PackagePartName drawingName = PackagingURIHelper.createPartName(drawingRel.getTargetURI());
+                    collected.add(drawingRel.getPackage().getPart(drawingName));
+                }
+                collected.add(slide);
+            }
+        } catch (InvalidFormatException e) {
+            //do something; for now the parts gathered so far are still returned
+        }
+        collected.add(mainDocument);
+        return collected;
+    }
+
+    private class XSLFCommentsHandler extends DefaultHandler {
+        //buffers all character data it receives and writes it out whenever a cm element ends
+        private XHTMLContentHandler xhtml;
+        private StringBuilder commentBuffer = new StringBuilder();
+        private String commentAuthorId = null;
+
+        XSLFCommentsHandler(XHTMLContentHandler xhtml) {
+            this.xhtml = xhtml;
+        }
+
+        @Override
+        public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+            //get date (dt)?
+            if (!"cm".equals(localName)) {
+                return;
+            }
+            commentAuthorId = atts.getValue("", "authorId");
+        }
+
+        @Override
+        public void characters(char[] ch, int start, int length) throws SAXException {
+            //TODO: require that we're in <p:text>?
+            commentBuffer.append(ch, start, length);
+        }
+
+        @Override
+        public void endElement(String uri, String localName, String qName) throws SAXException {
+            if (!"cm".equals(localName)) {
+                return;
+            }
+            //one <p class="slide-comment"> per comment: optional bold author, then the buffered text
+            final String name = commentAuthors.getName(commentAuthorId);
+            final String initials = commentAuthors.getInitials(commentAuthorId);
+            xhtml.startElement("p", "class", "slide-comment");
+            if (name != null || initials != null) {
+                xhtml.startElement("b");
+                if (name == null) {
+                    xhtml.characters(initials);
+                } else {
+                    xhtml.characters(name);
+                    if (initials != null) {
+                        //rendered as: name (initials)
+                        xhtml.characters(" (");
+                        xhtml.characters(initials);
+                        xhtml.characters(")");
+                    }
+                }
+                xhtml.endElement("b");
+            }
+            xhtml.characters(commentBuffer.toString());
+            xhtml.endElement("p");
+
+            //reset for the next cm element
+            commentAuthorId = null;
+            commentBuffer.setLength(0);
+        }
+    }
+
+    private class XSLFCommentAuthorHandler extends DefaultHandler {
+        //hands the id, name and initials attributes of every cmAuthor element to commentAuthors
+        @Override
+        public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+            if (!"cmAuthor".equals(localName)) {
+                return;
+            }
+            String authorId = null;
+            String authorName = null;
+            String authorInitials = null;
+            for (int idx = 0; idx < atts.getLength(); idx++) {
+                String attName = atts.getLocalName(idx);
+                if ("id".equals(attName)) {
+                    authorId = atts.getValue(idx);
+                } else if ("name".equals(attName)) {
+                    authorName = atts.getValue(idx);
+                } else if ("initials".equals(attName)) {
+                    authorInitials = atts.getValue(idx);
+                }
+            }
+            commentAuthors.add(authorId, authorName, authorInitials);
+        }
+    }
+
+
+    private static class PlaceHolderSkipper extends DefaultHandler {
+        //forwards SAX events to the wrapped handler, but drops everything from a ph start tag through the next sp end tag
+        private final XSLFDocumentXMLBodyHandler wrappedHandler;
+        private boolean inPH = false; //true while events are being dropped
+
+        PlaceHolderSkipper(XSLFDocumentXMLBodyHandler wrappedHandler) {
+            this.wrappedHandler = wrappedHandler;
+        }
+
+        @Override
+        public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+            if ("ph".equals(localName)) {
+                inPH = true;
+            }
+            if (! inPH) {
+                wrappedHandler.startElement(uri, localName, qName, atts);
+            }
+        }
+
+        @Override
+        public void endElement(String uri, String localName, String qName) throws SAXException {
+            if (! inPH) {
+                wrappedHandler.endElement(uri, localName, qName);
+            }
+            //checked after forwarding, so the sp end tag that stops the skipping is itself dropped
+            if ("sp".equals(localName)) {
+                inPH = false;
+            }
+        }
+
+        @Override
+        public void characters(char[] ch, int start, int length) throws SAXException {
+            if (! inPH) {
+                wrappedHandler.characters(ch, start, length);
+            }
+        }
+
+        @Override
+        public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+            //pass whitespace on as whitespace rather than re-labelling it as character data
+            if (! inPH) {
+                wrappedHandler.ignorableWhitespace(ch, start, length);
+            }
+        }
+    }
+
+    private class CommentAuthors {
+        //lookup tables for comment authors:
+        //author id -> name and author id -> initials;
+        //an id may have an entry in one map, both, or neither
+        Map<String, String> nameMap = new HashMap<>();
+        Map<String, String> initialMap = new HashMap<>();
+
+        //stores whichever of name and initials is non-null;
+        //a null id is ignored altogether
+        void add(String id, String name, String initials) {
+            if (id != null) {
+                if (name != null) {
+                    nameMap.put(id, name);
+                }
+                if (initials != null) {
+                    initialMap.put(id, initials);
+                }
+            }
+        }
+
+        //returns null for a null id or an id without a stored name
+        String getName(String id) {
+            return (id == null) ? null : nameMap.get(id);
+        }
+
+        //returns null for a null id or an id without stored initials
+        String getInitials(String id) {
+            return (id == null) ? null : initialMap.get(id);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index 70c7399..d60b274 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -16,11 +16,9 @@
  */
 package org.apache.tika.parser.microsoft.ooxml;
 
-import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
-import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
@@ -31,10 +29,8 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.openxml4j.opc.PackagePart;
 import org.apache.poi.openxml4j.opc.PackageRelationship;
 import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
-import org.apache.poi.openxml4j.opc.internal.FileHelper;
 import org.apache.poi.xwpf.usermodel.XWPFNumbering;
 import org.apache.poi.xwpf.usermodel.XWPFRelation;
-import org.apache.poi.xwpf.usermodel.XWPFStyles;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.microsoft.OfficeParserConfig;
@@ -62,12 +58,6 @@ import org.xml.sax.SAXException;
  */
 public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
 
-    private final static String[] EMBEDDED_RELATIONSHIPS = new String[]{
-            RELATION_AUDIO,
-            RELATION_IMAGE,
-            RELATION_PACKAGE,
-            RELATION_OFFICE_DOCUMENT
-    };
 
     //include all parts that might have embedded objects
     private final static String[] MAIN_PART_RELATIONS = new String[]{
@@ -171,7 +161,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
     private void handlePart(PackagePart packagePart, XWPFStylesShim styles,
                             XWPFListManager listManager, XHTMLContentHandler xhtml) throws IOException, SAXException {
 
-        Map<String, String> linkedRelationships = loadLinkedRelationships(packagePart);
+        Map<String, String> linkedRelationships = loadLinkedRelationships(packagePart, true);
         try (InputStream stream = packagePart.getInputStream()) {
             context.getSAXParser().parse(
                     new CloseShieldInputStream(stream),
@@ -185,51 +175,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
 
     }
 
-    private Map<String, String> loadLinkedRelationships(PackagePart bodyPart) {
-        Map<String, String> linkedRelationships = new HashMap<>();
-        try {
-            PackageRelationshipCollection prc = bodyPart.getRelationshipsByType(XWPFRelation.HYPERLINK.getRelation());
-            for (int i = 0; i < prc.size(); i++) {
-                PackageRelationship pr = prc.getRelationship(i);
-                if (pr == null) {
-                    continue;
-                }
-                String id = pr.getId();
-                String url = (pr.getTargetURI() == null) ? null : pr.getTargetURI().toString();
-                if (id != null && url != null) {
-                    linkedRelationships.put(id, url);
-                }
-            }
-
-            for (String rel : EMBEDDED_RELATIONSHIPS) {
 
-                prc = bodyPart.getRelationshipsByType(rel);
-                for (int i = 0; i < prc.size(); i++) {
-                    PackageRelationship pr = prc.getRelationship(i);
-                    if (pr == null) {
-                        continue;
-                    }
-                    String id = pr.getId();
-                    String uriString = (pr.getTargetURI() == null) ? null : pr.getTargetURI().toString();
-                    String fileName = uriString;
-                    if (pr.getTargetURI() != null) {
-                        try {
-                            fileName = FileHelper.getFilename(new File(fileName));
-                        } catch (Exception e) {
-                            fileName = uriString;
-                        }
-                    }
-                    if (id != null) {
-                        fileName = (fileName == null) ? "" : fileName;
-                        linkedRelationships.put(id, fileName);
-                    }
-                }
-            }
-
-        } catch (InvalidFormatException e) {
-        }
-        return linkedRelationships;
-    }
 
     private XWPFStylesShim loadStyles(PackagePart packagePart) {
         try {

http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFDocumentXMLBodyHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFDocumentXMLBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFDocumentXMLBodyHandler.java
new file mode 100644
index 0000000..b5aa449
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFDocumentXMLBodyHandler.java
@@ -0,0 +1,330 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xslf;
+
+
+import java.util.Map;
+
+import org.apache.tika.parser.microsoft.ooxml.AbstractDocumentXMLBodyHandler;
+import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
+import org.apache.tika.parser.microsoft.ooxml.RunProperties;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+/**
+ * This class is intended to handle anything that might contain IBodyElements:
+ * main document, headers, footers, notes, etc.
+ */
+
+public class XSLFDocumentXMLBodyHandler extends AbstractDocumentXMLBodyHandler {
+
+
+    private final XSLFBodyContentsHandler bodyContentsHandler;
+    //private final RelationshipsManager relationshipsManager;
+
+
+    //alternate content can be embedded in itself.
+    //need to track depth.
+    //if in alternate, choose fallback, maybe make this configurable?
+    private int inACChoiceDepth = 0;
+    private int inACFallbackDepth = 0; //counted, but not otherwise consulted in this class
+    //set when an hlinkClick carrying a relationship id is seen; cleared again in handleEndOfRun()
+    private boolean inHyperlink = false;
+    //relationship id -> target string; consulted to resolve hlinkClick ids
+    private final Map<String, String> linkedRelationships;
+
+    public XSLFDocumentXMLBodyHandler(XSLFBodyContentsHandler bodyContentsHandler,
+                                      Map<String, String> linkedRelationships) {
+        this.bodyContentsHandler = bodyContentsHandler;
+        this.linkedRelationships = linkedRelationships;
+    }
+
+
+    @Override
+    public void startDocument() throws SAXException {
+    } //no-op
+
+    @Override
+    public void endDocument() throws SAXException {
+    } //no-op
+
+    @Override
+    public void startPrefixMapping(String prefix, String uri) throws SAXException {
+    } //no-op
+
+    @Override
+    public void endPrefixMapping(String prefix) throws SAXException {
+    } //no-op
+
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+        //TODO: checkBox, textBox, sym, headerReference, footerReference, commentRangeEnd
+        //deferred paragraph start: fires when the element right after a P start is not a PPR
+        if (lastStartElementWasP && ! PPR.equals(localName)) {
+            bodyContentsHandler.startParagraph(currPProperties);
+            pStarted = true;
+        }
+
+        lastStartElementWasP = false;
+        //count nested CHOICE / FALLBACK elements, but only those in the MC_NS namespace
+        if (uri != null && uri.equals(MC_NS)) {
+            if (CHOICE.equals(localName)) {
+                inACChoiceDepth++;
+            } else if (FALLBACK.equals(localName)) {
+                inACFallbackDepth++;
+            }
+        }
+        //everything inside a CHOICE is ignored
+        if (inACChoiceDepth > 0) {
+            return;
+        }
+        //these are sorted descending by frequency
+        //in our regression corpus
+        if (RPR.equals(localName)) {
+            inRPr = true;
+        } else if (R.equals(localName)) {
+            inR = true;
+        } else if (T.equals(localName)) {
+            inT = true;
+        } else if (TAB.equals(localName)) {
+            runBuffer.append(TAB_CHAR);
+        } else if (P.equals(localName)) {
+            lastStartElementWasP = true;
+        } else if (B.equals(localName)) { //TODO: add bCs
+            if(inR && inRPr) {
+                currRunProperties.setBold(true);
+            }
+        } else if (TC.equals(localName)) {
+            bodyContentsHandler.startTableCell();
+        } else if (P_STYLE.equals(localName)) {
+            String styleId = atts.getValue(W_NS, "val"); //NOTE(review): W_NS is presumably the docx namespace - confirm it applies to pptx
+            currPProperties.setStyleID(styleId);
+        } else if (I.equals(localName)) { //TODO: add iCs
+            //rprs don't have to be inR; ignore those that aren't
+            if (inR && inRPr) {
+                currRunProperties.setItalics(true);
+            }
+        } else if (FLD.equals(localName)) {
+            inR = true;
+        } else if (TR.equals(localName)) {
+            bodyContentsHandler.startTableRow();
+        } else if (NUM_PR.equals(localName)) {
+            inNumPr = true;
+        } else if (ILVL.equals(localName)) {
+            if (inNumPr) {
+                currPProperties.setIlvl(getIntVal(atts));
+            }
+        } else if (NUM_ID.equals(localName)) {
+            if (inNumPr) {
+                currPProperties.setNumId(getIntVal(atts));
+            }
+        } else if(BR.equals(localName)) {
+            runBuffer.append(NEWLINE);
+        } else if ("hlinkClick".equals(localName)) {
+            String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
+            String hyperlink = null;
+            if (hyperlinkId != null) {
+                hyperlink = linkedRelationships.get(hyperlinkId);
+                bodyContentsHandler.hyperlinkStart(hyperlink);
+                inHyperlink = true;
+            }/* else {
+                String anchor = atts.getValue(W_NS, "anchor");
+                if (anchor != null) {
+                    anchor = "#" + anchor;
+                }
+                bodyContentsHandler.hyperlinkStart(anchor);
+                inHyperlink = true;
+            }*/
+        } else if(TBL.equals(localName)) {
+            bodyContentsHandler.startTable();
+        } else if (BLIP.equals(localName)) { //check for DRAWING_NS
+            picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "embed");
+        } else if ("cNvPr".equals(localName)) { //check for PIC_NS?
+            picDescription = atts.getValue("", "descr");
+        } else if (PIC.equals(localName)) {
+            inPic = true; //check for PIC_NS?
+        } else if (IMAGEDATA.equals(localName)) {
+            picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
+            picDescription = atts.getValue(O_NS, "title");
+        } else if (OLE_OBJECT.equals(localName)){ //check for O_NS?
+            String type = null;
+            String refId = null;
+            //TODO: clean this up and ...want to get ProgID?
+            for (int i = 0; i < atts.getLength(); i++) {
+                String attLocalName = atts.getLocalName(i);
+                String attValue = atts.getValue(i);
+                if (attLocalName.equals("Type")) {
+                    type = attValue;
+                } else if (OFFICE_DOC_RELATIONSHIP_NS.equals(atts.getURI(i)) && attLocalName.equals("id")) {
+                    refId = attValue;
+                }
+            }
+            if ("Embed".equals(type)) {
+                bodyContentsHandler.embeddedOLERef(refId);
+            }
+        } else if(CR.equals(localName)) {
+            runBuffer.append(NEWLINE);
+        }
+
+    }
+
+
+    private int getIntVal(Attributes atts) {
+        String raw = atts.getValue(W_NS, "val");
+        if (raw == null) {
+            return -1; //attribute absent
+        }
+        try {
+            return Integer.parseInt(raw);
+        } catch (NumberFormatException e) {
+            return -1; //attribute present but not an integer
+        }
+    }
+
+
+    @Override
+    public void endElement(String uri, String localName, String qName) throws SAXException {
+        //undo the depth counting done in startElement, which only counts MC_NS elements
+        if (uri != null && uri.equals(MC_NS)) {
+            if (CHOICE.equals(localName)) {
+                inACChoiceDepth--;
+            } else if (FALLBACK.equals(localName)) {
+                inACFallbackDepth--;
+            }
+        }
+        if (inACChoiceDepth > 0) {
+            return;
+        }
+        if (PIC.equals(localName)) { //PIC_NS
+            handlePict(); //also clears inPic
+            return;
+        } else if (RPR.equals(localName)) {
+            inRPr = false;
+        } else if (R.equals(localName)) {
+            handleEndOfRun();
+        } else if (T.equals(localName)) {
+            inT = false;
+        } else if (PPR.equals(localName)) {
+            if (!pStarted) {
+                bodyContentsHandler.startParagraph(currPProperties);
+                pStarted = true;
+            }
+            currPProperties.reset();
+        } else if (P.equals(localName)) {
+            if (runBuffer.length() > 0) {
+                //<p><tab></p>...this will treat that as if it were
+                //a run...TODO: should we swallow whitespace that doesn't occur in a run?
+                bodyContentsHandler.run(currRunProperties, runBuffer.toString());
+                runBuffer.setLength(0);
+            }
+            pStarted = false;
+            bodyContentsHandler.endParagraph();
+        } else if (TC.equals(localName)) {
+            bodyContentsHandler.endTableCell();
+        } else if (TR.equals(localName)) {
+            bodyContentsHandler.endTableRow();
+        } else if (TBL.equals(localName)) {
+            bodyContentsHandler.endTable();
+        } else if (FLD.equals(localName)) {
+            handleEndOfRun();
+        } else if (HYPERLINK.equals(localName)) {
+            bodyContentsHandler.hyperlinkEnd();
+        } else if (PICT.equals(localName)) {
+            handlePict();
+        }
+    }
+
+    private void handleEndOfRun() {
+        bodyContentsHandler.run(currRunProperties, runBuffer.toString()); //emit the buffered text first
+        if (inHyperlink) { //a hyperlink opened during this run is closed with it
+            bodyContentsHandler.hyperlinkEnd();
+            inHyperlink = false;
+        }
+        currRunProperties.setItalics(false);
+        currRunProperties.setBold(false);
+        runBuffer.setLength(0);
+        inR = false;
+    }
+
+    private void handlePict() {
+        //TODO: look the name up via linkedRelationships.get(picRId);
+        //until then a fixed placeholder is reported whenever an id is present
+        String picFileName = (picRId == null) ? null : "picId";
+        bodyContentsHandler.embeddedPicRef(picFileName, picDescription);
+        //clear the picture state for the next one
+        picRId = null;
+        picDescription = null;
+        inPic = false;
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+        //text is buffered only while inT is set
+        //and no CHOICE element is open
+        boolean insideChoice = inACChoiceDepth > 0;
+        if (insideChoice || !inT) {
+            return;
+        }
+        runBuffer.append(ch, start, length);
+    }
+
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+        //whitespace is buffered under the same rule as character data:
+        //only while inT is set and no CHOICE element is open;
+        //anything else is discarded
+        final boolean keep = inT && inACChoiceDepth <= 0;
+        if (keep) {
+            runBuffer.append(ch, start, length);
+        }
+    }
+
+
+    public interface XSLFBodyContentsHandler {
+        //callbacks invoked by XSLFDocumentXMLBodyHandler as it processes SAX events
+        void run(RunProperties runProperties, String contents);
+
+        /**
+         * @param link the link; can be null
+         */
+        void hyperlinkStart(String link);
+
+        void hyperlinkEnd();
+
+        void startParagraph(ParagraphProperties paragraphProperties);
+
+        void endParagraph();
+
+        void startTable();
+
+        void endTable();
+
+        void startTableRow();
+
+        void endTableRow();
+
+        void startTableCell();
+
+        void endTableCell();
+        //refId is the relationship id attribute found on the OLE object element; null if it had none
+        void embeddedOLERef(String refId);
+        //picFileName is null when no picture relationship id was seen
+        void embeddedPicRef(String picFileName, String picDescription);
+
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
new file mode 100644
index 0000000..15bbd6a
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
@@ -0,0 +1,161 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xslf;
+
+import java.io.IOException;
+
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.POIXMLProperties;
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
+import org.apache.tika.parser.microsoft.ooxml.RunProperties;
+import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
+import org.apache.xmlbeans.XmlException;
+
+public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor {
+
+
+    private OPCPackage container;
+    private POIXMLProperties properties;
+
+    public XSLFEventBasedPowerPointExtractor(String path) throws XmlException, OpenXML4JException, IOException {
+        this(OPCPackage.open(path));
+    }
+
+    public XSLFEventBasedPowerPointExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
+        super((POIXMLDocument) null);
+        this.container = container;
+        this.properties = new POIXMLProperties(container);
+    }
+
+
+    public static void main(String[] args) throws Exception {
+        if (args.length < 1) {
+            System.err.println("Use:");
+            System.err.println("  XSLFEventBasedPowerPointExtractor <filename.pptx>");
+            System.exit(1);
+        }
+        // Must be the PowerPoint extractor: the usage text and the input are .pptx, not .docx.
+        XSLFEventBasedPowerPointExtractor extractor = new XSLFEventBasedPowerPointExtractor(args[0]);
+        System.out.println(extractor.getText());
+        extractor.close();
+    }
+
+    public OPCPackage getPackage() {
+        return this.container;
+    }
+
+    public POIXMLProperties.CoreProperties getCoreProperties() {
+        return this.properties.getCoreProperties();
+    }
+
+    public POIXMLProperties.ExtendedProperties getExtendedProperties() {
+        return this.properties.getExtendedProperties();
+    }
+
+    public POIXMLProperties.CustomProperties getCustomProperties() {
+        return this.properties.getCustomProperties();
+    }
+
+
+    @Override
+    public String getText() {
+        //TODO
+        return "";
+    }
+
+
+
+    private class XSLFToTextContentHandler implements XSLFDocumentXMLBodyHandler.XSLFBodyContentsHandler {
+        private final StringBuilder buffer;
+
+        public XSLFToTextContentHandler(StringBuilder buffer) {
+            this.buffer = buffer;
+        }
+
+        @Override
+        public void run(RunProperties runProperties, String contents) {
+            buffer.append(contents);
+        }
+
+        @Override
+        public void hyperlinkStart(String link) {
+            //no-op
+        }
+
+        @Override
+        public void hyperlinkEnd() {
+            //no-op
+        }
+
+        @Override
+        public void startParagraph(ParagraphProperties paragraphProperties) {
+            //no-op
+        }
+
+        @Override
+        public void endParagraph() {
+            buffer.append("\n");
+        }
+
+        @Override
+        public void startTable() {
+
+        }
+
+        @Override
+        public void endTable() {
+
+        }
+
+        @Override
+        public void startTableRow() {
+
+        }
+
+        @Override
+        public void endTableRow() {
+            buffer.append("\n");
+        }
+
+        @Override
+        public void startTableCell() {
+
+        }
+
+        @Override
+        public void endTableCell() {
+            buffer.append("\t");
+        }
+
+
+
+        @Override
+        public void embeddedOLERef(String refId) {
+            //no-op
+        }
+
+        @Override
+        public void embeddedPicRef(String picFileName, String picDescription) {
+            //no-op
+        }
+
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFTikaBodyPartHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFTikaBodyPartHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFTikaBodyPartHandler.java
new file mode 100644
index 0000000..ff587f7
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFTikaBodyPartHandler.java
@@ -0,0 +1,262 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xslf;
+
+
+import java.math.BigInteger;
+
+import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
+import org.apache.tika.parser.microsoft.ooxml.RunProperties;
+import org.apache.tika.parser.microsoft.ooxml.XWPFListManager;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+public class XSLFTikaBodyPartHandler implements XSLFDocumentXMLBodyHandler.XSLFBodyContentsHandler {
+
+    private final static String P = "p";
+
+    private final static char[] NEWLINE = new char[]{'\n'};
+    private final static char[] TAB = new char[]{'\t'};
+
+    private final XHTMLContentHandler xhtml;
+
+    private int pDepth = 0; //paragraph depth
+    private int tableDepth = 0;//table depth
+    private int pWithinCell = 0;//paragraph count within a cell
+    private boolean isItalics = false;
+    private boolean isBold = false;
+    private boolean wroteHyperlinkStart = false;
+    private boolean inTableCell = false;
+
+    public XSLFTikaBodyPartHandler(XHTMLContentHandler xhtml) {
+        this.xhtml = xhtml;
+    }
+
+    @Override
+    public void run(RunProperties runProperties, String contents) {
+        try {
+            // <b> is the outer tag: an open <i> is closed before <b> is toggled and re-opened below if needed.
+            if (runProperties.getBold() != isBold) {
+                if (isItalics) {
+                    xhtml.endElement("i");
+                    isItalics = false;
+                }
+                if (runProperties.getBold()) {
+                    xhtml.startElement("b");
+                    isBold = true;
+                } else {
+                    xhtml.endElement("b");
+                    isBold = false;
+                }
+            }
+
+            if (runProperties.getItalics() != isItalics) {
+                if (runProperties.getItalics()) {
+                    xhtml.startElement("i");
+                    isItalics = true;
+                } else {
+                    xhtml.endElement("i");
+                    isItalics = false;
+                }
+            }
+
+            xhtml.characters(contents);
+
+        } catch (SAXException e) {
+            // NOTE(review): SAXException is swallowed; the callback interface declares no throws clause.
+        }
+    }
+
+    @Override
+    public void hyperlinkStart(String link) {
+        try {
+            if (link != null) {
+                xhtml.startElement("a", "href", link);
+                wroteHyperlinkStart = true;
+            }
+        } catch (SAXException e) {
+            // NOTE(review): SAXException is swallowed; the callback interface declares no throws clause.
+        }
+    }
+
+    @Override
+    public void hyperlinkEnd() {
+        try {
+            if (wroteHyperlinkStart) {
+                closeStyleTags();
+                wroteHyperlinkStart = false;
+                xhtml.endElement("a");
+            }
+        } catch (SAXException e) {
+            // NOTE(review): SAXException is swallowed; the callback interface declares no throws clause.
+        }
+    }
+
+    @Override
+    public void startParagraph(ParagraphProperties paragraphProperties) {
+        if (pDepth == 0 && tableDepth == 0) {
+            try {
+                xhtml.startElement(P);
+            } catch (SAXException e) {
+                // NOTE(review): SAXException is swallowed; the callback interface declares no throws clause.
+            }
+        }
+        pDepth++;
+    }
+
+    @Override
+    public void endParagraph() {
+        try {
+            closeStyleTags();
+            if (pDepth == 1 && tableDepth == 0) {
+                xhtml.endElement(P);
+            } else if (pWithinCell > 0){
+                xhtml.characters(NEWLINE, 0, 1);
+            }
+        } catch (SAXException e) {
+            // NOTE(review): SAXException is swallowed; the callback interface declares no throws clause.
+        }
+        if (inTableCell) {
+            pWithinCell++;
+        }
+        pDepth--;
+    }
+
+    @Override
+    public void startTable() {
+        try {
+            xhtml.startElement("table");
+            tableDepth++;
+        } catch (SAXException e) {
+            // NOTE(review): SAXException is swallowed; if startElement throws, tableDepth is not incremented.
+        }
+    }
+
+    @Override
+    public void endTable() {
+        try {
+            xhtml.endElement("table");
+            tableDepth--;
+        } catch (SAXException e) {
+            // NOTE(review): SAXException is swallowed; if endElement throws, tableDepth is not decremented.
+        }
+    }
+
+    @Override
+    public void startTableRow() {
+        try {
+            xhtml.startElement("tr");
+        } catch (SAXException e) {
+            // NOTE(review): SAXException is swallowed; the callback interface declares no throws clause.
+        }
+    }
+
+    @Override
+    public void endTableRow() {
+        try {
+            xhtml.endElement("tr");
+        } catch (SAXException e) {
+            // NOTE(review): SAXException is swallowed; the callback interface declares no throws clause.
+        }
+    }
+
+    @Override
+    public void startTableCell() {
+        try {
+            xhtml.startElement("td");
+        } catch (SAXException e) {
+            // NOTE(review): SAXException is swallowed; inTableCell is still set to true below.
+        }
+        inTableCell = true;
+    }
+
+    @Override
+    public void endTableCell() {
+        try {
+            xhtml.endElement("td");
+        } catch (SAXException e) {
+            // NOTE(review): SAXException is swallowed; the cell state below is reset regardless.
+        }
+        inTableCell = false;
+        pWithinCell = 0;
+    }
+
+
+    @Override
+    public void embeddedOLERef(String relId) {
+        if (relId == null) {
+            return;
+        }
+        try {
+            AttributesImpl attributes = new AttributesImpl();
+            attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+            attributes.addAttribute("", "id", "id", "CDATA", relId);
+            xhtml.startElement("div", attributes);
+            xhtml.endElement("div");
+            // The result is an empty <div class="embedded" id="relId"/> placeholder for the embedded object.
+        } catch (SAXException e) {
+            // NOTE(review): SAXException is swallowed; the callback interface declares no throws clause.
+        }
+    }
+
+    @Override
+    public void embeddedPicRef(String picFileName, String picDescription) {
+        // Emits an empty <img>; src and alt are only added when the corresponding argument is non-null.
+        try {
+            AttributesImpl attr = new AttributesImpl();
+            if (picFileName != null) {
+                attr.addAttribute("", "src", "src", "CDATA", "embedded:" + picFileName);
+            }
+            if (picDescription != null) {
+                attr.addAttribute("", "alt", "alt", "CDATA", picDescription);
+            }
+
+            xhtml.startElement("img", attr);
+            xhtml.endElement("img");
+
+        } catch (SAXException e) {
+            // NOTE(review): SAXException is swallowed; the callback interface declares no throws clause.
+        }
+    }
+
+    private void closeStyleTags() throws SAXException {
+        if (isItalics) {
+            xhtml.endElement("i");
+            isItalics = false;
+        }
+        if (isBold) {
+            xhtml.endElement("b");
+            isBold = false;
+        }
+    }
+
+    private void writeParagraphNumber(int numId, int ilvl,
+                                      XWPFListManager listManager,
+                                      XHTMLContentHandler xhtml) throws SAXException {
+
+        if (ilvl < 0 || numId < 0 || listManager == null) {
+            return;
+        }
+        String number = listManager.getFormattedNumber(BigInteger.valueOf(numId), ilvl);
+        if (number != null) {
+            xhtml.characters(number);
+        }
+
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
index 610a2cb..d08fb07 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
@@ -21,17 +21,19 @@ package org.apache.tika.parser.microsoft.ooxml.xwpf;
 import java.util.Date;
 import java.util.Map;
 
+import org.apache.tika.parser.microsoft.ooxml.AbstractDocumentXMLBodyHandler;
+import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
+import org.apache.tika.parser.microsoft.ooxml.RunProperties;
 import org.apache.tika.utils.DateUtils;
 import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
 
 /**
  * This class is intended to handle anything that might contain IBodyElements:
  * main document, headers, footers, notes, etc.
  */
 
-public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
+public class XWPFDocumentXMLBodyHandler extends AbstractDocumentXMLBodyHandler {
 
 
     enum EditType {
@@ -43,86 +45,24 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
     }
 
 
-    final static String W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
-    private final static String MC_NS = "http://schemas.openxmlformats.org/markup-compatibility/2006";
-    private final static String O_NS = "urn:schemas-microsoft-com:office:office";
-    private final static String PIC_NS = "http://schemas.openxmlformats.org/drawingml/2006/picture";
-    private final static String DRAWING_MAIN_NS = "http://schemas.openxmlformats.org/drawingml/2006/main";
-    private static final String V_NS = "urn:schemas-microsoft-com:vml";
-
-    private final static String OFFICE_DOC_RELATIONSHIP_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
-
-    private final static char[] TAB_CHAR = new char[1];
-    private final static char NEWLINE = '\n';
-
-    static {
-        TAB_CHAR[0] = '\t';
-    }
-
-    private final static String R = "r";
-    private final static String RPR = "rPr";
-    private final static String P = "p";
-    private final static String P_STYLE = "pStyle";
-    private final static String PPR = "pPr";
-    private final static String T = "t";
-    private final static String TAB = "tab";
-    private final static String B = "b";
-    private final static String ILVL = "ilvl";
-    private final static String NUM_ID = "numId";
-    private final static String TC = "tc";
-    private final static String TR = "tr";
-    private final static String I = "i";
-    private final static String NUM_PR = "numPr";
-    private final static String BR = "br";
     private final static String BOOKMARK_START = "bookmarkStart";
     private final static String BOOKMARK_END = "bookmarkEnd";
-    private final static String HYPERLINK = "hyperlink";
-    private final static String TBL = "tbl";
-    private final static String PIC = "pic";
-    private final static String PICT = "pict";
     private final static String FOOTNOTE_REFERENCE = "footnoteReference";
-    private final static String IMAGEDATA = "imagedata";
-    private final static String BLIP = "blip";
     private final static String INS = "ins";
     private final static String DEL = "del";
     private final static String DEL_TEXT = "delText";
     private final static String MOVE_FROM = "moveFrom";
     private final static String MOVE_TO = "moveTo";
-    private final static String OLE_OBJECT = "OLEObject";
-    private final static String CR = "cr";
     private final static String ENDNOTE_REFERENCE = "endnoteReference";
-    private final static String CHOICE = "Choice";
-    private final static String FALLBACK = "Fallback";
 
     private final XWPFBodyContentsHandler bodyContentsHandler;
     //private final RelationshipsManager relationshipsManager;
     private final Map<String, String> linkedRelationships;
 
-    private final StringBuilder runBuffer = new StringBuilder();
-
-    private boolean inR = false;
-    private boolean inT = false;
-    private int pDepth = 0;
-    private boolean inRPr = false;
-    private boolean inNumPr = false;
     private boolean inDelText = false;
 
-    private boolean inPic = false;
-    private boolean inPict = false;
-    private String picDescription = null;
-    private String picRId = null;
-    private String picFilename = null;
-    private boolean lastStartElementWasP;
-
-    //alternate content can be embedded in itself.
-    //need to track depth.
-    //if in alternate, choose fallback, maybe make this configurable?
-    private int inACChoiceDepth = 0;
-    private int inACFallbackDepth = 0;
-    private EditType editType = EditType.NONE;
+    private XWPFDocumentXMLBodyHandler.EditType editType = XWPFDocumentXMLBodyHandler.EditType.NONE;
 
-    private XWPFRunProperties currRunProperties = new XWPFRunProperties();
-    private XWPFParagraphProperties currPProperties = new XWPFParagraphProperties();
 
     public XWPFDocumentXMLBodyHandler(XWPFBodyContentsHandler bodyContentsHandler,
                                       Map<String, String> hyperlinks) {
@@ -180,7 +120,6 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
             runBuffer.append(TAB_CHAR);
         } else if (P.equals(localName)) {
             lastStartElementWasP = true;
-            pDepth++;
         } else if (B.equals(localName)) { //TODO: add bCs
             if(inR && inRPr) {
                 currRunProperties.setBold(true);
@@ -334,7 +273,6 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
             currPProperties.reset();
         } else if (P.equals(localName)) {
             bodyContentsHandler.endParagraph();
-            pDepth--;
         } else if (TC.equals(localName)) {
             bodyContentsHandler.endTableCell();
         } else if (TR.equals(localName)) {
@@ -398,7 +336,7 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
 
     public interface XWPFBodyContentsHandler {
 
-        void run(XWPFRunProperties runProperties, String contents);
+        void run(RunProperties runProperties, String contents);
 
         /**
          * @param link the link; can be null
@@ -407,7 +345,7 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
 
         void hyperlinkEnd();
 
-        void startParagraph(XWPFParagraphProperties paragraphProperties);
+        void startParagraph(ParagraphProperties paragraphProperties);
 
         void endParagraph();
 

http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
index 3cae6d9..f61fa56 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
@@ -38,6 +38,8 @@ import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
 import org.apache.poi.util.SAXHelper;
 import org.apache.poi.xwpf.usermodel.XWPFNumbering;
 import org.apache.poi.xwpf.usermodel.XWPFRelation;
+import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
+import org.apache.tika.parser.microsoft.ooxml.RunProperties;
 import org.apache.tika.parser.microsoft.ooxml.XWPFListManager;
 import org.apache.xmlbeans.XmlException;
 import org.xml.sax.InputSource;
@@ -209,29 +211,7 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
         }
         return hyperlinks;
     }
-/*
-    private XWPFStyles loadStyles(PackagePart packagePart) {
-        try {
-            PackageRelationshipCollection stylesParts =
-                    packagePart.getRelationshipsByType(XWPFRelation.STYLES.getRelation());
-            if (stylesParts.size() > 0) {
-                PackageRelationship stylesRelationShip = stylesParts.getRelationship(0);
-                if (stylesRelationShip == null) {
-                    return null;
-                }
-                PackagePart stylesPart = opcPackage.getPart(stylesRelationShip);
-                if (stylesPart == null) {
-                    return null;
-                }
-                return new XWPFStyles(stylesPart);
-            }
-        } catch (IOException|OpenXML4JException e) {
-            //swallow
-        }
-        return null;
 
-    }
-*/
     private XWPFNumbering loadNumbering(PackagePart packagePart) {
         try {
             PackageRelationshipCollection numberingParts = packagePart.getRelationshipsByType(XWPFRelation.NUMBERING.getRelation());
@@ -260,7 +240,7 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
         }
 
         @Override
-        public void run(XWPFRunProperties runProperties, String contents) {
+        public void run(RunProperties runProperties, String contents) {
             buffer.append(contents);
         }
 
@@ -275,7 +255,7 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
         }
 
         @Override
-        public void startParagraph(XWPFParagraphProperties paragraphProperties) {
+        public void startParagraph(ParagraphProperties paragraphProperties) {
             //no-op
         }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFParagraphProperties.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFParagraphProperties.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFParagraphProperties.java
deleted file mode 100644
index fd2b022..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFParagraphProperties.java
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-
-public class XWPFParagraphProperties {
-
-    private String styleId;
-    private int ilvl = -1;
-    private int numId = -1;
-
-    String getStyleID() {
-        return  styleId;
-    }
-
-    void setStyleID(String styleId) {
-        this.styleId = styleId;
-    }
-
-    void reset() {
-        styleId = null;
-        ilvl = -1;
-        numId = -1;
-    }
-
-    public void setIlvl(int ilvl) {
-        this.ilvl = ilvl;
-    }
-
-    public void setNumId(int numId) {
-        this.numId = numId;
-    }
-
-    public int getIlvl() {
-        return ilvl;
-    }
-
-    public int getNumId() {
-        return numId;
-    }
-}