You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/11/30 20:31:16 UTC

[7/7] tika git commit: TIKA 1321 initial commit

TIKA 1321 initial commit


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/32162f59
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/32162f59
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/32162f59

Branch: refs/heads/2.x
Commit: 32162f59ee6c0fdea40f0231cd9aff43519f2a7a
Parents: de103c8
Author: tballison <ta...@mitre.org>
Authored: Wed Nov 30 15:30:57 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Wed Nov 30 15:30:57 2016 -0500

----------------------------------------------------------------------
 CHANGES.txt                                     |    3 +
 .../java/org/apache/tika/utils/DateUtils.java   |   72 +-
 .../parser/microsoft/AbstractOfficeParser.java  |   68 +
 .../parser/microsoft/MSOfficeParserConfig.java  |   38 -
 .../tika/parser/microsoft/OfficeParser.java     |    5 +-
 .../parser/microsoft/OfficeParserConfig.java    |   82 +
 .../microsoft/ooxml/MetadataExtractor.java      |    4 +-
 .../microsoft/ooxml/OOXMLExtractorFactory.java  |   36 +-
 .../parser/microsoft/ooxml/OOXMLParser.java     |    6 +-
 .../ooxml/SXWPFWordExtractorDecorator.java      |  224 ++
 .../parser/microsoft/ooxml/XWPFListManager.java |   25 +-
 .../ooxml/XWPFWordExtractorDecorator.java       |    2 +-
 .../microsoft/ooxml/xwpf/BinaryDataHandler.java |  120 -
 .../ooxml/xwpf/BodyContentHandler.java          |  271 --
 .../ooxml/xwpf/CorePropertiesHandler.java       |  144 -
 .../ooxml/xwpf/ExtendedPropertiesHandler.java   |   67 -
 .../microsoft/ooxml/xwpf/PartHandler.java       |   43 -
 .../microsoft/ooxml/xwpf/Relationship.java      |   52 -
 .../ooxml/xwpf/RelationshipsHandler.java        |   86 -
 .../ooxml/xwpf/RelationshipsManager.java        |   58 -
 .../microsoft/ooxml/xwpf/Word2006MLHandler.java |  168 -
 .../microsoft/ooxml/xwpf/Word2006MLParser.java  |   67 -
 .../ooxml/xwpf/XWPFDocumentXMLBodyHandler.java  |  318 ++
 .../ooxml/xwpf/XWPFEventBasedWordExtractor.java |  353 ++
 .../microsoft/ooxml/xwpf/XWPFRunProperties.java |   44 +
 .../ooxml/xwpf/XWPFTikaBodyPartHandler.java     |  224 ++
 .../ooxml/xwpf/ml2006/AbstractPartHandler.java  |   43 +
 .../ooxml/xwpf/ml2006/BinaryDataHandler.java    |  120 +
 .../ooxml/xwpf/ml2006/BodyPartHandler.java      |   64 +
 .../xwpf/ml2006/CorePropertiesHandler.java      |  144 +
 .../xwpf/ml2006/ExtendedPropertiesHandler.java  |   67 +
 .../ooxml/xwpf/ml2006/PartHandler.java          |   34 +
 .../ooxml/xwpf/ml2006/Relationship.java         |   52 +
 .../ooxml/xwpf/ml2006/RelationshipsHandler.java |   86 +
 .../ooxml/xwpf/ml2006/RelationshipsManager.java |   58 +
 .../ooxml/xwpf/ml2006/Word2006MLDocHandler.java |  171 +
 .../ooxml/xwpf/ml2006/Word2006MLParser.java     |   71 +
 .../services/org.apache.tika.parser.Parser      |    2 +-
 .../parser/microsoft/ooxml/OOXMLParserTest.java |   31 +
 .../ooxml/xwpf/Word2006MLParserTest.java        |  182 -
 .../ooxml/xwpf/ml2006/Word2006MLParserTest.java |  171 +
 .../ooxml/xwpf/SXWPFExtractorTest.java          |  187 +
 .../test-documents/testWORD_2003ml.xml          | 2058 +++++++---
 .../test-documents/testWORD_2006ml.docx         |  Bin 0 -> 165566 bytes
 .../test-documents/testWORD_2006ml.xml          | 3621 +++++++++++++++---
 .../test-documents/testWORD_2006ml_src.docx     |  Bin 99960 -> 0 bytes
 46 files changed, 7351 insertions(+), 2391 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 82c29e2..3e3ef8b 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,6 +17,9 @@ Release 2.0 - ???
 
 Release 1.15 -???
 
+  * Added experimental SAX parser for .docx files. To select this parser,
+    call setUseSAXDocxExtractor(true) on OfficeParserConfig (TIKA-1321).
+
   * Change default behavior to parse embedded documents even if the user
     forgets to specify a Parser.class in the ParseContext (TIKA-2096).
     Users who wish to parse only the container document should set

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java b/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java
index 6b764c8..6ccc74e 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java
@@ -16,9 +16,14 @@
  */
 package org.apache.tika.utils;
 
+import java.text.DateFormat;
+import java.text.DateFormatSymbols;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
 import java.util.Calendar;
 import java.util.Date;
 import java.util.GregorianCalendar;
+import java.util.List;
 import java.util.Locale;
 import java.util.TimeZone;
 
@@ -26,6 +31,7 @@ import java.util.TimeZone;
  * Date related utility methods and constants
  */
 public class DateUtils {
+
     /**
      * The UTC time zone. Not sure if {@link TimeZone#getTimeZone(String)}
      * understands "UTC" in all environments, but it'll fall back to GMT
@@ -44,8 +50,49 @@ public class DateUtils {
      */
     public static final TimeZone MIDDAY = TimeZone.getTimeZone("GMT-12:00");
 
+    private static ThreadLocal<DateFormat> createDateFormat(final String format, final TimeZone timezone) {
+        // SimpleDateFormat is not thread safe, so each thread builds its own instance in initialValue().
+        return new ThreadLocal<DateFormat>() {
+            @Override
+            public DateFormat initialValue() {
+                SimpleDateFormat sdf = new SimpleDateFormat(format, new DateFormatSymbols(Locale.US));
+                if (timezone != null) { // null leaves the formatter on the JVM default time zone
+                    sdf.setTimeZone(timezone);
+                }
+                return sdf;
+            }
+        };
+    }
+
+    /**
+     * Some parsers will have the date as an ISO-8601 string
+     *  already, and will set that into the Metadata object.
+     * So that we can return Date objects for these, this is the
+     *  list (in preference order) of the various ISO-8601
+     *  variants that we try when processing a date based
+     *  property.
+     */
+    private static final List<ThreadLocal<DateFormat>> ISO_8601_INPUT_FORMATS = loadDateFormats();
+
+    private static List<ThreadLocal<DateFormat>> loadDateFormats() {
+        List<ThreadLocal<DateFormat>> dateFormats = new ArrayList<>();
+        // Patterns are added in preference order. First: date and time joined by 'T'
+        dateFormats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", UTC));   // literal 'Z' suffix, read as UTC
+        dateFormats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", null));    // with numeric zone offset
+        dateFormats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ss", null));     // no zone in the text
+        // Same three variants with a space between date and time
+        dateFormats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ss'Z'", UTC));   // literal 'Z' suffix, read as UTC
+        dateFormats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ssZ", null));    // with numeric zone offset
+        dateFormats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ss", null));     // no zone in the text
+        // Date without time: midnight in the MIDDAY zone (GMT-12:00), which is midday UTC
+        dateFormats.add(createDateFormat("yyyy-MM-dd", MIDDAY));       // Normal date format
+        dateFormats.add(createDateFormat("yyyy:MM:dd", MIDDAY));              // Image (IPTC/EXIF) format
+
+        return dateFormats;
+    }
+
     /**
-     * Returns a ISO 8601 representation of the given date. This method 
+     * Returns a ISO 8601 representation of the given date. This method
      * is thread safe and non-blocking.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-495">TIKA-495</a>
@@ -58,7 +105,7 @@ public class DateUtils {
         return doFormatDate(calendar);
     }
     /**
-     * Returns a ISO 8601 representation of the given date. This method 
+     * Returns a ISO 8601 representation of the given date. This method
      * is thread safe and non-blocking.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-495">TIKA-495</a>
@@ -66,7 +113,7 @@ public class DateUtils {
      * @return ISO 8601 date string, including timezone details
      */
     public static String formatDate(Calendar date) {
-        // Explicitly switch it into UTC before formatting 
+        // Explicitly switch it into UTC before formatting
         date.setTimeZone(UTC);
         return doFormatDate(date);
     }
@@ -98,4 +145,23 @@ public class DateUtils {
                 calendar.get(Calendar.MINUTE),
                 calendar.get(Calendar.SECOND));
     }
+
+    /**
+     * Tries each format in ISO_8601_INPUT_FORMATS in order and returns the first
+     * successful parse, using the DateFormat obtained from each ThreadLocal.
+     *
+     * @param dateString date text to parse; may be null
+     * @return the parsed date, or null if dateString is null or no format matched
+     */
+    public static Date tryToParse(String dateString) {
+        if (dateString == null) { return null; } // parse(null) would throw NullPointerException
+        for (ThreadLocal<DateFormat> df : ISO_8601_INPUT_FORMATS) {
+            try {
+                return df.get().parse(dateString);
+            } catch (java.text.ParseException ignored) {
+                // not this variant; try the next format
+            }
+        }
+        return null;
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
new file mode 100644
index 0000000..2538219
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
@@ -0,0 +1,68 @@
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * Intermediate layer to set {@link OfficeParserConfig} uniformly.
+ */
+public abstract class AbstractOfficeParser extends AbstractParser {
+
+    private final OfficeParserConfig defaultOfficeParserConfig = new OfficeParserConfig();
+
+    /**
+     * Checks to see if the user has specified an {@link OfficeParserConfig}.
+     * If so, no changes are made; if not, this parser's default is added to the context.
+     *
+     * @param parseContext context to inspect and, if it has no config, to populate
+     */
+    public void configure(ParseContext parseContext) {
+        OfficeParserConfig officeParserConfig = parseContext.get(OfficeParserConfig.class, defaultOfficeParserConfig);
+        parseContext.set(OfficeParserConfig.class, officeParserConfig);
+    }
+
+    /**
+     * Returns the include-deleted-content setting of this parser's default config.
+     * @return the value held by the default {@link OfficeParserConfig}
+     * @see OfficeParserConfig#getIncludeDeletedContent
+     */
+    public boolean getIncludeDeletedContent() {
+        return defaultOfficeParserConfig.getIncludeDeletedContent();
+    }
+
+    /**
+     * Returns the include-moveFrom-content setting of this parser's default config.
+     *
+     * @return the value held by the default {@link OfficeParserConfig}
+     * @see OfficeParserConfig#getIncludeMoveFromContent()
+     */
+    public boolean getIncludeMoveFromContent() {
+        return defaultOfficeParserConfig.getIncludeMoveFromContent();
+    }
+
+    /**
+     * Returns the use-SAX-docx-extractor setting of this parser's default config.
+     * @return the value held by the default {@link OfficeParserConfig}
+     * @see OfficeParserConfig#getUseSAXDocxExtractor()
+     */
+    public boolean getUseSAXDocxExtractor() {
+        return defaultOfficeParserConfig.getUseSAXDocxExtractor();
+    }
+
+    // The setters below change the defaults that configure() falls back to.
+//    @Field
+    public void setIncludeDeletedContent(boolean includeDeletedContent) {
+        defaultOfficeParserConfig.setIncludeDeletedContent(includeDeletedContent);
+    }
+
+//    @Field
+    public void setIncludeMoveFromContent(boolean includeMoveFromContent) {
+        defaultOfficeParserConfig.setIncludeMoveFromContent(includeMoveFromContent);
+    }
+
+//    @Field
+    public void setUseSAXDocxExtractor(boolean useSAXDocxExtractor) {
+        defaultOfficeParserConfig.setUseSAXDocxExtractor(useSAXDocxExtractor);
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/MSOfficeParserConfig.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/MSOfficeParserConfig.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/MSOfficeParserConfig.java
deleted file mode 100644
index 8f8086a..0000000
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/MSOfficeParserConfig.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-
-public class MSOfficeParserConfig {
-
-    private boolean includeDeletedContent = true;
-
-    /**
-     * Sets whether or not the parser should include deleted content.
-     * <b>This has not been implemented in all MSOffice parsers yet!!!</b>
-     * @param includeDeletedContent
-     */
-    public void setIncludeDeletedContent(boolean includeDeletedContent) {
-        this.includeDeletedContent = includeDeletedContent;
-    }
-
-    public boolean getIncludeDeletedContent() {
-        return includeDeletedContent;
-    }
-}
-
-

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index 5218dfa..7e21ba8 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -48,7 +48,6 @@ import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.PasswordProvider;
 import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
@@ -61,7 +60,7 @@ import org.xml.sax.SAXException;
 /**
  * Defines a Microsoft document content extractor.
  */
-public class OfficeParser extends AbstractParser {
+public class OfficeParser extends AbstractOfficeParser {
 
     /**
      * Serial version UID
@@ -98,6 +97,8 @@ public class OfficeParser extends AbstractParser {
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
+
+        configure(context);
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
 

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
new file mode 100644
index 0000000..55f4673
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+
+import java.io.Serializable;
+/** Settings holder passed through the ParseContext to the Microsoft Office parsers. */
+public class OfficeParserConfig implements Serializable {
+
+    private boolean includeDeletedContent = true;
+    private boolean includeMoveFromContent = false;
+
+    private boolean useSAXDocxExtractor = false;
+
+    /**
+     * Sets whether or not the parser should include deleted content.
+     * <p>
+     * <b>This has only been implemented in the streaming docx parser
+     * ({@link org.apache.tika.parser.microsoft.ooxml.SXWPFWordExtractorDecorator}) so far!!!</b>
+     * @param includeDeletedContent true to include deleted content (the default is true)
+     */
+    public void setIncludeDeletedContent(boolean includeDeletedContent) {
+        this.includeDeletedContent = includeDeletedContent;
+    }
+
+    public boolean getIncludeDeletedContent() {
+        return includeDeletedContent;
+    }
+
+    /**
+     * With track changes on, when a section is moved, the content
+     * is stored in both the "moveFrom" section and in the "moveTo" section.
+     * <p>
+     * If you'd like to include the section both in its original location (moveFrom)
+     * and in its new location (moveTo), set this to <code>true</code>
+     * <p>
+     * Default: <code>false</code>
+     * <p>
+     * <b>This has only been implemented in the streaming docx parser
+     * ({@link org.apache.tika.parser.microsoft.ooxml.SXWPFWordExtractorDecorator}) so far!!!</b>
+     * @param includeMoveFromContent true to also include content at its moveFrom location
+     */
+    public void setIncludeMoveFromContent(boolean includeMoveFromContent) {
+        this.includeMoveFromContent = includeMoveFromContent;
+    }
+
+    public boolean getIncludeMoveFromContent() {
+        return includeMoveFromContent;
+    }
+
+    public boolean getUseSAXDocxExtractor() {
+        return useSAXDocxExtractor;
+    }
+
+    /**
+     * Use the experimental SAX-based streaming DOCX parser?
+     * If set to <code>false</code>, the classic parser will be used; if <code>true</code>,
+     * the new experimental parser will be used.
+     * <p>
+     * Default: classic parser
+     * @param useSAXDocxExtractor true to select the experimental SAX-based parser
+     */
+    public void setUseSAXDocxExtractor(boolean useSAXDocxExtractor) {
+        this.useSAXDocxExtractor = useSAXDocxExtractor;
+    }
+}
+
+

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
index 91d49c7..d392346 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
@@ -36,6 +36,7 @@ import org.apache.tika.metadata.PagedText;
 import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.microsoft.SummaryExtractor;
+import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
 import org.apache.xmlbeans.impl.values.XmlValueOutOfRangeException;
 import org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperty;
 import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
@@ -57,7 +58,8 @@ public class MetadataExtractor {
 
     public void extract(Metadata metadata) throws TikaException {
         if (extractor.getDocument() != null ||
-                (extractor instanceof XSSFEventBasedExcelExtractor &&
+                ((extractor instanceof XSSFEventBasedExcelExtractor ||
+                        extractor instanceof XWPFEventBasedWordExtractor) &&
                         extractor.getPackage() != null)) {
             extractMetadata(extractor.getCoreProperties(), metadata);
             extractMetadata(extractor.getExtendedProperties(), metadata);

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 4ba99f2..e6f30c7 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -28,17 +28,22 @@ import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
 import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
 import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.openxml4j.opc.PackageAccess;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
 import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
 import org.apache.poi.xslf.usermodel.XMLSlideShow;
 import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
 import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.poi.xwpf.usermodel.XWPFRelation;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.OfficeParserConfig;
+import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
 import org.apache.tika.parser.opc.OPCDetector;
 import org.apache.xmlbeans.XmlException;
 import org.xml.sax.ContentHandler;
@@ -83,12 +88,22 @@ public class OOXMLExtractorFactory {
             metadata.set(Metadata.CONTENT_TYPE, type.toString());
 
             // Have the appropriate OOXML text extractor picked
-            POIXMLTextExtractor poiExtractor = ExtractorFactory.createExtractor(pkg);
+            POIXMLTextExtractor poiExtractor = null;
+            OfficeParserConfig config = context.get(OfficeParserConfig.class, new OfficeParserConfig());
+            if (config.getUseSAXDocxExtractor()) {
+                poiExtractor = trySXWPF(pkg);
+            }
+            if (poiExtractor == null) {
+                poiExtractor = ExtractorFactory.createExtractor(pkg);
+            }
 
             POIXMLDocument document = poiExtractor.getDocument();
             if (poiExtractor instanceof XSSFEventBasedExcelExtractor) {
                 extractor = new XSSFExcelExtractorDecorator(
                         context, (XSSFEventBasedExcelExtractor) poiExtractor, locale);
+            } else if (poiExtractor instanceof XWPFEventBasedWordExtractor) {
+                extractor = new SXWPFWordExtractorDecorator(context,
+                        (XWPFEventBasedWordExtractor)poiExtractor);
             } else if (document == null) {
                 throw new TikaException(
                         "Expecting UserModel based POI OOXML extractor with a document, but none found. " +
@@ -129,4 +144,23 @@ public class OOXMLExtractorFactory {
         }
     }
 
+    private static POIXMLTextExtractor trySXWPF(OPCPackage pkg) throws XmlException, OpenXML4JException, IOException {
+        // Returns an XWPFEventBasedWordExtractor when pkg's officeDocument part has a supported Word content type, else null.
+        PackageRelationshipCollection packageRelationshipCollection = pkg.getRelationshipsByType("http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument");
+        if(packageRelationshipCollection.size() == 0) {
+            packageRelationshipCollection = pkg.getRelationshipsByType("http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument"); // retry with the purl.oclc.org form
+        }
+        if (packageRelationshipCollection.size() == 0) {
+            return null; // no officeDocument relationship under either URI
+        }
+        PackagePart corePart = pkg.getPart(packageRelationshipCollection.getRelationship(0));
+        String targetContentType = corePart.getContentType();
+        for (XWPFRelation relation : XWPFWordExtractor.SUPPORTED_TYPES) {
+            if (targetContentType.equals(relation.getContentType())) {
+                return new XWPFEventBasedWordExtractor(pkg);
+            }
+        }
+        return null; // not a supported Word type; the caller then uses ExtractorFactory.createExtractor
+    }
+
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
index 22f2cac..a6d0b34 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
@@ -27,15 +27,15 @@ import org.apache.poi.openxml4j.util.ZipSecureFile;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.AbstractOfficeParser;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 /**
  * Office Open XML (OOXML) parser.
  */
-public class OOXMLParser extends AbstractParser {
+public class OOXMLParser extends AbstractOfficeParser {
     static {
         //turn off POI's zip bomb detection because we have our own
         ZipSecureFile.setMinInflateRatio(-1.0d);
@@ -83,6 +83,8 @@ public class OOXMLParser extends AbstractParser {
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
+        //set OfficeParserConfig if the user hasn't specified one
+        configure(context);
         // Have the OOXML file processed
         OOXMLExtractorFactory.parse(stream, handler, metadata, context);
     }

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
new file mode 100644
index 0000000..ce33c08
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -0,0 +1,224 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.xwpf.usermodel.XWPFNumbering;
+import org.apache.poi.xwpf.usermodel.XWPFRelation;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.OfficeParserConfig;
+import org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor;
+import org.apache.tika.parser.microsoft.ooxml.XWPFListManager;
+import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFDocumentXMLBodyHandler;
+import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
+import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFTikaBodyPartHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.xmlbeans.XmlException;
+import org.xml.sax.SAXException;
+
+/**
+ * This is an experimental, alternative extractor for docx files.
+ * This streams the main document content rather than loading the
+ * full document into memory.
+ * <p>
+ * This will be better for some use cases than the classic docx extractor; and,
+ * it will be worse for others.
+ * </p>
+ *
+ * @since 1.15
+ */
+public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
+
+
+    private final OPCPackage opcPackage;
+    private final ParseContext context;
+
+    public SXWPFWordExtractorDecorator(ParseContext context,
+                                       XWPFEventBasedWordExtractor extractor) {
+        super(context, extractor);
+        //kept locally: the context supplies the SAX parser and config, the package supplies the parts
+        this.context = context;
+        this.opcPackage = extractor.getPackage();
+    }
+
+    @Override
+    protected void buildXHTML(XHTMLContentHandler xhtml)
+            throws SAXException, XmlException, IOException {
+        //the main document is emitted first, followed by the glossary document;
+        //every part found for a content type is streamed in the order the package returns it
+        String[] contentTypes = new String[]{
+                XWPFRelation.DOCUMENT.getContentType(),
+                XWPFRelation.GLOSSARY_DOCUMENT.getContentType()
+        };
+        for (String contentType : contentTypes) {
+            List<PackagePart> parts = opcPackage.getPartsByContentType(contentType);
+            if (parts == null) {
+                //nothing of this type in the package
+                continue;
+            }
+            for (PackagePart part : parts) {
+                //likely only one, but why not...
+                handleDocumentPart(part, xhtml);
+            }
+        }
+    }
+
+    private void handleDocumentPart(PackagePart documentPart, XHTMLContentHandler xhtml) throws IOException, SAXException {
+        //load the numbering/list manager and styles from the main document part
+        XWPFListManager xwpfListManager = new XWPFListManager(loadNumbering(documentPart));
+        //TODO: XWPFStyles styles = loadStyles(documentPart);
+
+        //headers come out ahead of the body
+        handleRelatedParts(documentPart, XWPFRelation.HEADER, xwpfListManager, xhtml);
+
+        //main document
+        handlePart(documentPart, xwpfListManager, xhtml);
+
+        //for now, just dump other components at end
+        XWPFRelation[] trailing = new XWPFRelation[]{
+                XWPFRelation.FOOTNOTE,
+                XWPFRelation.COMMENT,
+                XWPFRelation.FOOTER,
+                XWPFRelation.ENDNOTE
+        };
+        for (XWPFRelation rel : trailing) {
+            handleRelatedParts(documentPart, rel, xwpfListManager, xhtml);
+        }
+    }
+
+    /**
+     * Streams every part that <code>documentPart</code> references through a
+     * relationship of the given type, in relationship order. An
+     * InvalidFormatException stops the remaining parts of that type only.
+     */
+    private void handleRelatedParts(PackagePart documentPart, XWPFRelation rel,
+                                    XWPFListManager xwpfListManager, XHTMLContentHandler xhtml)
+            throws IOException, SAXException {
+        try {
+            PackageRelationshipCollection prc = documentPart.getRelationshipsByType(rel.getRelation());
+            if (prc != null) {
+                for (int i = 0; i < prc.size(); i++) {
+                    handlePart(documentPart.getRelatedPart(prc.getRelationship(i)), xwpfListManager, xhtml);
+                }
+            }
+        } catch (InvalidFormatException e) {
+            //swallow
+        }
+    }
+
+    private void handlePart(PackagePart packagePart,
+                            XWPFListManager xwpfListManager, XHTMLContentHandler xhtml) throws IOException, SAXException {
+        //streams one part (header, body, footnotes, ...) through the SAX body handler, resolving its hyperlinks
+        Map<String, String> hyperlinks = loadHyperlinkRelationships(packagePart);
+        try (InputStream stream = packagePart.getInputStream()) {
+            context.getSAXParser().parse(
+                    new CloseShieldInputStream(stream),
+                    new OfflineContentHandler(new EmbeddedContentHandler(
+                            new XWPFDocumentXMLBodyHandler(
+                                    new XWPFTikaBodyPartHandler(xhtml, xwpfListManager,
+                                            context.get(OfficeParserConfig.class)), hyperlinks))));
+        } catch (TikaException e) {
+            //presumably raised when no SAX parser can be obtained from the context; fail loudly with the cause
+            throw new IOException("Unable to process package part " + packagePart.getPartName(), e);
+        }
+    }
+
+    private Map<String, String> loadHyperlinkRelationships(PackagePart bodyPart) {
+        //collects relationship id -> target for each hyperlink relationship of this part
+        Map<String, String> hyperlinks = new HashMap<>();
+        try {
+            PackageRelationshipCollection prc = bodyPart.getRelationshipsByType(XWPFRelation.HYPERLINK.getRelation());
+            //defensive null guard, as for the header/footer relationship lookups in this class
+            int size = (prc == null) ? 0 : prc.size();
+            for (int i = 0; i < size; i++) {
+                PackageRelationship pr = prc.getRelationship(i);
+                if (pr == null || pr.getId() == null || pr.getTargetURI() == null) {
+                    continue;
+                }
+                hyperlinks.put(pr.getId(), pr.getTargetURI().toString());
+            }
+        } catch (InvalidFormatException e) {
+            //swallow: links are best effort, so return whatever was collected so far
+        }
+        return hyperlinks;
+    }
+/*
+    private XWPFStyles loadStyles(PackagePart packagePart) {
+        try {
+            PackageRelationshipCollection stylesParts =
+                    packagePart.getRelationshipsByType(XWPFRelation.STYLES.getRelation());
+            if (stylesParts.size() > 0) {
+                PackageRelationship stylesRelationShip = stylesParts.getRelationship(0);
+                if (stylesRelationShip == null) {
+                    return null;
+                }
+                PackagePart stylesPart = opcPackage.getPart(stylesRelationShip);
+                if (stylesPart == null) {
+                    return null;
+                }
+                return new XWPFStyles(stylesPart);
+            }
+        } catch (IOException|OpenXML4JException e) {
+            //swallow
+        }
+        return null;
+
+    }
+*/
+    private XWPFNumbering loadNumbering(PackagePart packagePart) {
+        //returns the numbering reached through this part's first numbering relationship,
+        //or null if there is none or it cannot be loaded
+        try {
+            PackageRelationshipCollection rels = packagePart.getRelationshipsByType(XWPFRelation.NUMBERING.getRelation());
+            if (rels.size() < 1) {
+                return null;
+            }
+            //any further numbering relationships are ignored
+            PackageRelationship first = rels.getRelationship(0);
+            PackagePart numberingPart = (first == null) ? null : opcPackage.getPart(first);
+            if (numberingPart != null) {
+                return new XWPFNumbering(numberingPart);
+            }
+        } catch (IOException | OpenXML4JException e) {
+            //swallow
+        }
+        return null;
+    }
+
+    /**
+     * Returns only the parts that carry the main document content type; glossary parts are not looked up here.
+     */
+    @Override
+    protected List<PackagePart> getMainDocumentParts() {
+        return opcPackage.getPartsByContentType(XWPFRelation.DOCUMENT.getContentType());
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
index ffbb167..d51f2e9 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
@@ -16,8 +16,9 @@
  */
 package org.apache.tika.parser.microsoft.ooxml;
 
+import java.math.BigInteger;
+
 import org.apache.poi.xwpf.usermodel.XWPFAbstractNum;
-import org.apache.poi.xwpf.usermodel.XWPFDocument;
 import org.apache.poi.xwpf.usermodel.XWPFNum;
 import org.apache.poi.xwpf.usermodel.XWPFNumbering;
 import org.apache.poi.xwpf.usermodel.XWPFParagraph;
@@ -47,8 +48,8 @@ public class XWPFListManager extends AbstractListManager {
     private final XWPFNumbering numbering;
 
     //map of numId (which paragraph series is this a member of?), levelcounts
-    public XWPFListManager(XWPFDocument document) {
-        numbering = document.getNumbering();
+    public XWPFListManager(XWPFNumbering numbering) {
+        this.numbering = numbering;
     }
 
     /**
@@ -57,11 +58,18 @@ public class XWPFListManager extends AbstractListManager {
      * @return the formatted number or an empty string if something went wrong
      */
     public String getFormattedNumber(final XWPFParagraph paragraph) {
-        if (numbering == null) {
+        return getFormattedNumber(paragraph.getNumID(),
+                paragraph.getNumIlvl() == null ? -1 : paragraph.getNumIlvl().intValue());
+    }
+
+    public String getFormattedNumber(BigInteger numId, int iLvl) {
+        if (numbering == null || iLvl < 0 || numId == null) {
             return "";
         }
-        int currNumId = paragraph.getNumID().intValue();
-        XWPFNum xwpfNum = numbering.getNum(paragraph.getNumID());
+        int currNumId = numId.intValue();
+
+        XWPFNum xwpfNum = numbering.getNum(numId);
+
         if (xwpfNum == null) {
             return "";
         }
@@ -78,14 +86,15 @@ public class XWPFListManager extends AbstractListManager {
             overrideTuples = loadOverrideTuples(ctNum, lc.getNumberOfLevels());
         }
 
-        String formattedString = lc.incrementLevel(paragraph.getNumIlvl().intValue(), overrideTuples);
+        String formattedString = lc.incrementLevel(iLvl, overrideTuples);
 
         listLevelMap.put(currAbNumId, lc);
         overrideTupleMap.put(currNumId, overrideTuples);
 
         return formattedString;
+
     }
-    
+
     private LevelTuple[] loadOverrideTuples(CTNum ctNum, int length) {
         LevelTuple[] levelTuples = new LevelTuple[length];
         int overrideLength = ctNum.sizeOfLvlOverrideArray();

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
index 408a591..aba736a 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
@@ -83,7 +83,7 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
     protected void buildXHTML(XHTMLContentHandler xhtml)
             throws SAXException, XmlException, IOException {
         XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
-        XWPFListManager listManager = new XWPFListManager(document);
+        XWPFListManager listManager = new XWPFListManager(document.getNumbering());
         // headers
         if (hfPolicy != null) {
             extractHeaders(xhtml, hfPolicy, listManager);

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/BinaryDataHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/BinaryDataHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/BinaryDataHandler.java
deleted file mode 100644
index c2177cf..0000000
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/BinaryDataHandler.java
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.commons.codec.binary.Base64;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.EmbeddedDocumentUtil;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-
-class BinaryDataHandler extends PartHandler {
-
-    private final XHTMLContentHandler handler;
-    private final Metadata metadata;
-    private final ParseContext parseContext;
-
-    private boolean inBinaryData = false;
-    private StringBuilder buffer = new StringBuilder();
-
-    final Base64 base64 = new Base64();
-
-
-    public BinaryDataHandler(XHTMLContentHandler handler, Metadata metadata, ParseContext context) {
-        this.handler = handler;
-        this.metadata = metadata;
-        this.parseContext = context;
-    }
-
-
-    @Override
-    public void startDocument() throws SAXException {
-    }
-
-    @Override
-    public void endDocument() throws SAXException {
-
-    }
-
-    @Override
-    void endPart() throws SAXException, TikaException {
-        if (hasData()) {
-            EmbeddedDocumentExtractor embeddedDocumentExtractor =
-                    EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext);
-            Metadata embeddedMetadata = new Metadata();
-            try (TikaInputStream stream = TikaInputStream.get(getInputStream())) {
-                embeddedDocumentExtractor.parseEmbedded(stream, handler, embeddedMetadata, false);
-            } catch (IOException e) {
-                throw new TikaException("error in finishing part", e);
-            }
-            buffer.setLength(0);
-        }
-
-    }
-
-    @Override
-    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
-
-        if (uri.equals(Word2006MLHandler.PKG_NS) && localName.equals("binaryData")) {
-            inBinaryData = true;
-        }
-    }
-
-    @Override
-    public void endElement(String uri, String localName, String qName) throws SAXException {
-        if (uri.equals(Word2006MLHandler.PKG_NS) && localName.equals("binaryData")) {
-            inBinaryData = false;
-        }
-    }
-
-    @Override
-    public void characters(char[] ch, int start, int length) throws SAXException {
-        if (inBinaryData) {
-            buffer.append(ch, start, length);
-        }
-    }
-
-    @Override
-    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
-
-    }
-
-    @Override
-    public String getPartContentType() {
-        return "";
-    }
-
-    boolean hasData() {
-        return buffer.length() > 0;
-    }
-
-    private InputStream getInputStream() {
-        byte[] bytes = base64.decode(buffer.toString());
-        return new ByteArrayInputStream(bytes);
-    }
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/BodyContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/BodyContentHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/BodyContentHandler.java
deleted file mode 100644
index ea16191..0000000
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/BodyContentHandler.java
+++ /dev/null
@@ -1,271 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-
-import org.apache.poi.xwpf.usermodel.XWPFRelation;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.microsoft.MSOfficeParserConfig;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-
-/**
- * This class is intended to handle anything that might contain IBodyElements:
- * main document, headers, footers, notes, etc.
- */
-
-class BodyContentHandler extends PartHandler {
-
-
-    private enum EditType{
-        NONE,
-        INSERT,
-        DELETE
-    };
-
-    private final static String W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
-    private final static String MC_NS = "http://schemas.openxmlformats.org/markup-compatibility/2006";
-    private final static String OFFICE_DOC_RELATIONSHIP_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
-
-    private final static char[] TAB = new char[1];
-
-    static {
-        TAB[0] = '\t';
-    }
-
-    private final String partName;
-    private final RelationshipsManager relationshipsManager;
-    private final XHTMLContentHandler handler;
-    private final Metadata metadata;
-    private final ParseContext parseContext;
-    private final boolean includeDeletedContent;
-
-    private boolean inR = false;
-    private boolean inT = false;
-    private boolean inRPr = false;
-    private boolean inDelText = false;
-    private boolean inAlternateContent = false; //in alternate content section
-    private boolean inACChoice = false; //if in alternate, choice or fallback?
-    private boolean inACFallback = false;
-    private boolean hasWrittenAHref = false;
-    private boolean hasWrittenFormatting = false;
-    private String editAuthor = null;
-    private String editDate = null;
-    private EditType editType = EditType.NONE;
-    private String hyperlink = null;
-
-    private TmpFormatting currFormat = new TmpFormatting();
-
-    public BodyContentHandler(String partName, RelationshipsManager relationshipsManager,
-                              XHTMLContentHandler handler, Metadata metadata, ParseContext context) {
-        this.partName = partName;
-        this.relationshipsManager = relationshipsManager;
-        this.handler = handler;
-        this.metadata = metadata;
-        this.parseContext = context;
-        MSOfficeParserConfig config = context.get(MSOfficeParserConfig.class);
-        boolean tmpIncludeDeleted = true;
-        if (config != null) {
-            tmpIncludeDeleted = config.getIncludeDeletedContent();
-        }
-        includeDeletedContent = tmpIncludeDeleted;
-    }
-
-
-    @Override
-    public void startDocument() throws SAXException {
-    }
-
-    @Override
-    public void endDocument() throws SAXException {
-    }
-
-    @Override
-    public void startPrefixMapping(String prefix, String uri) throws SAXException {
-    }
-
-    @Override
-    public void endPrefixMapping(String prefix) throws SAXException {
-    }
-
-    @Override
-    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
-        if (uri.equals(MC_NS)) {
-            if (localName.equals("AlternateContent")) {
-                inAlternateContent = true;
-            } else if (localName.equals("Choice")) {
-                inACChoice = true;
-            } else if (localName.equals("Fallback")) {
-                inACFallback = true;
-            }
-        }
-        if (inACFallback) {
-            return;
-        }
-
-        if (uri.equals(W_NS)) {
-            if (localName.equals("p")) {
-                handler.startElement("p");
-            } else if (localName.equals("r")) {
-                inR = true;
-            } else if (localName.equals("t")) {
-                inT = true;
-            } else if (localName.equals("tab")) {
-                handler.characters(TAB, 0, 1);
-            } else if (localName.equals("tbl")) {
-                handler.startElement("table");
-            } else if (localName.equals("tc")) {
-                handler.startElement("td");
-            } else if (localName.equals("tr")) {
-                handler.startElement("tr");
-            } else if (localName.equals("rPr")) {
-                inRPr = true;
-            } else if (inR && inRPr && localName.equals("i")) {
-                //rprs don't have to be inR; ignore those that aren't
-                currFormat.italics = true;
-            } else if (inR && inRPr && localName.equals("b")) {
-                currFormat.bold = true;
-            } else if (localName.equals("delText")) {
-                inDelText = true;
-            } else if (localName.equals("ins")) {
-                editAuthor = atts.getValue(W_NS, "author");
-                editDate = atts.getValue(W_NS, "date");
-                editType = EditType.INSERT;
-            } else if (localName.equals("del")) {
-                editAuthor = atts.getValue(W_NS, "author");
-                editDate = atts.getValue(W_NS, "date");
-                editType = EditType.DELETE;
-            } else if (localName.equals("hyperlink")) {
-                String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
-                if (hyperlinkId != null) {
-                    Relationship relationship = relationshipsManager.getRelationship(getName(), hyperlinkId);
-                    if (relationship != null && XWPFRelation.HYPERLINK.getRelation().equals(relationship.getContentType())) {
-                        hyperlink = relationship.getTarget();
-                        handler.startElement("a", "href", hyperlink);
-                        hasWrittenAHref = true;
-                    }
-                }
-            }
-        }
-    }
-
-    @Override
-    public void endElement(String uri, String localName, String qName) throws SAXException {
-        if (uri.equals(MC_NS)) {
-            if (localName.equals("AlternateContent")) {
-                inAlternateContent = false;
-            } else if (localName.equals("Choice")) {
-                inACChoice = false;
-            } else if (localName.equals("Fallback")) {
-                inACFallback = false;
-            }
-        }
-        if (uri.equals(W_NS)) {
-            if (inACFallback) {
-                return;
-            }
-            if (localName.equals("p")) {
-                handler.endElement("p");
-            } else if (localName.equals("r")) {
-                closeStyleTags();
-                inR = false;
-                hasWrittenFormatting = false;
-            } else if (localName.equals("t")) {
-                inT = false;
-            } else if (localName.equals("tbl")) {
-                handler.endElement("table");
-            } else if (localName.equals("tc")) {
-                handler.endElement("td");
-            } else if (localName.equals("tr")) {
-                handler.endElement("tr");
-            } else if (localName.equals("rPr")) {
-                inRPr = false;
-            } else if (localName.equals("delText")) {
-                inDelText = false;
-            } else if (localName.equals("ins") || localName.equals("del")) {
-                editType = EditType.NONE;
-                editAuthor = null;
-                editDate = null;
-            } else if (localName.equals("hyperlink") && hasWrittenAHref) {
-                handler.endElement("a");
-                hasWrittenAHref = false;
-            }
-        }
-    }
-
-    @Override
-    public void characters(char[] ch, int start, int length) throws SAXException {
-        if (inACFallback) {
-            return;
-        }
-
-        if (inR && !hasWrittenFormatting) {
-            if (currFormat.bold) {
-                handler.startElement("b");
-            }
-            if (currFormat.italics) {
-                handler.startElement("i");
-            }
-            hasWrittenFormatting = true;
-        }
-        if (inT) {
-            handler.characters(ch, start, length);
-        } else if (includeDeletedContent && inDelText) {
-            handler.characters(ch, start, length);
-        }
-    }
-
-    @Override
-    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
-        if (inACFallback) {
-            return;
-        }
-
-        if (inT) {
-            handler.characters(ch, start, length);
-        }
-    }
-
-    @Override
-    public String getPartContentType() {
-        return partName;
-    }
-
-
-
-    void closeStyleTags() throws SAXException {
-        if (hasWrittenFormatting) {
-            if (currFormat.italics) {
-                handler.endElement("i");
-            }
-            if (currFormat.bold) {
-                handler.endElement("b");
-            }
-        }
-
-        currFormat.bold = false;
-        currFormat.italics = false;
-    }
-
-    private class TmpFormatting {
-        boolean italics = false;
-        boolean bold = false;
-    }
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/CorePropertiesHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/CorePropertiesHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/CorePropertiesHandler.java
deleted file mode 100644
index b0bca08..0000000
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/CorePropertiesHandler.java
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-
-import java.util.HashMap;
-import java.util.Map;
-
-import org.apache.poi.openxml4j.opc.ContentTypes;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.OfficeOpenXMLCore;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-
-class CorePropertiesHandler extends PartHandler {
-
-    final static String DC_NS = "http://purl.org/dc/elements/1.1";
-    final static String DC_TERMS_NS = "http://purl.org/dc/terms";
-    final static String CP_NS = "http://schemas.openxmlformats.org/package/2006/metadata/core-properties";
-
-    private final Metadata metadata;
-
-    final StringBuilder buffer = new StringBuilder();
-    final Map<String, Map<String, Property>> properties = new HashMap<>();
-
-    public CorePropertiesHandler(Metadata metadata) {
-        this.metadata = metadata;
-        addProperties();
-    }
-
-    void addProperties() {
-        Map<String, Property> dc = properties.get(DC_NS);
-        if (dc == null) {
-            dc = new HashMap<>();
-        }
-        dc.put("creator", TikaCoreProperties.CREATOR);
-        dc.put("title", TikaCoreProperties.TITLE);
-        dc.put("description", TikaCoreProperties.DESCRIPTION);
-        properties.put(DC_NS, dc);
-
-        Map<String, Property> dcTerms = properties.get(DC_TERMS_NS);
-        if (dcTerms == null) {
-            dcTerms = new HashMap<>();
-        }
-        dcTerms.put("created", TikaCoreProperties.CREATED);
-        dcTerms.put("modified", TikaCoreProperties.MODIFIED);
-
-        properties.put(DC_TERMS_NS, dcTerms);
-
-        Map<String, Property> cp = properties.get(CP_NS);
-        if (cp == null) {
-            cp = new HashMap<>();
-        }
-        cp.put("category", OfficeOpenXMLCore.CATEGORY);
-        cp.put("contentStatus", OfficeOpenXMLCore.CONTENT_STATUS);
-        cp.put("lastModifiedBy", OfficeOpenXMLCore.LAST_MODIFIED_BY);
-        cp.put("lastPrinted", OfficeOpenXMLCore.LAST_PRINTED);
-        cp.put("revision", OfficeOpenXMLCore.REVISION);
-        cp.put("subject", OfficeOpenXMLCore.SUBJECT);
-        cp.put("version", OfficeOpenXMLCore.VERSION);
-        properties.put(CP_NS, cp);
-    }
-
-    @Override
-    public void startDocument() throws SAXException {
-    }
-
-    @Override
-    public void endDocument() throws SAXException {
-        buffer.setLength(0);
-    }
-
-    @Override
-    public void startPrefixMapping(String prefix, String uri) throws SAXException {
-    }
-
-    @Override
-    public void endPrefixMapping(String prefix) throws SAXException {
-    }
-
-    @Override
-    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
-
-    }
-
-    @Override
-    public void endElement(String uri, String localName, String qName) throws SAXException {
-        Property prop = getProperty(uri, localName);
-        if (prop != null) {
-
-            if (prop.isMultiValuePermitted()) {
-                metadata.add(prop, buffer.toString());
-            } else {
-                metadata.set(prop, buffer.toString());
-            }
-        }
-        buffer.setLength(0);
-
-    }
-
-    private Property getProperty(String uri, String localName) {
-        if (uri.endsWith("/")) {
-            uri = uri.substring(0, uri.length()-1);
-        }
-
-        Map<String, Property> m = properties.get(uri);
-        if (m != null) {
-            return m.get(localName);
-        }
-        return null;
-    }
-
-    @Override
-    public void characters(char[] ch, int start, int length) throws SAXException {
-        buffer.append(ch, start, length);
-    }
-
-    @Override
-    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
-        buffer.append(ch, start, length);
-    }
-
-    @Override
-    public String getPartContentType() {
-        return ContentTypes.CORE_PROPERTIES_PART;
-    }
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ExtendedPropertiesHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ExtendedPropertiesHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ExtendedPropertiesHandler.java
deleted file mode 100644
index 07e5e23..0000000
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ExtendedPropertiesHandler.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-
-import java.util.HashMap;
-import java.util.Map;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
-import org.apache.tika.metadata.OfficeOpenXMLExtended;
-import org.apache.tika.metadata.Property;
-
-class ExtendedPropertiesHandler extends CorePropertiesHandler {
-
-    final static String EP_NS = "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties";
-
-    public ExtendedPropertiesHandler(Metadata metadata) {
-        super(metadata);
-    }
-
-    @Override
-    void addProperties() {
-        Map<String, Property> ep = properties.get(EP_NS);
-        if (ep == null) {
-            ep = new HashMap<>();
-        }
-        ep.put("AppVersion", OfficeOpenXMLExtended.APP_VERSION);
-        ep.put("Application", OfficeOpenXMLExtended.APPLICATION);
-        ep.put("Comments", OfficeOpenXMLExtended.COMMENTS);
-        ep.put("Company", OfficeOpenXMLExtended.COMPANY);
-        ep.put("DocSecurity", OfficeOpenXMLExtended.DOC_SECURITY);
-        ep.put("HiddenSlides", OfficeOpenXMLExtended.HIDDEN_SLIDES);
-        ep.put("Manager", OfficeOpenXMLExtended.MANAGER);
-        ep.put("Notes", OfficeOpenXMLExtended.NOTES);
-        ep.put("PresentationFormat", OfficeOpenXMLExtended.PRESENTATION_FORMAT);
-        ep.put("Template", OfficeOpenXMLExtended.TEMPLATE);
-        ep.put("TotalTime", OfficeOpenXMLExtended.TOTAL_TIME);
-        ep.put("Pages", Office.PAGE_COUNT);
-        ep.put("Words", Office.WORD_COUNT);
-        ep.put("Characters", Office.CHARACTER_COUNT);
-        ep.put("CharactersWithSpaces", Office.CHARACTER_COUNT_WITH_SPACES);
-        ep.put("Paragraphs", Office.PARAGRAPH_COUNT);
-        ep.put("Lines", Office.LINE_COUNT);
-        properties.put(EP_NS, ep);
-    }
-
-    @Override
-    public String getPartContentType() {
-        return "application/vnd.openxmlformats-officedocument.extended-properties+xml";
-    }
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/PartHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/PartHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/PartHandler.java
deleted file mode 100644
index 79bcafe..0000000
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/PartHandler.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-import org.apache.tika.exception.TikaException;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-abstract class PartHandler extends DefaultHandler {
-
-    private String name;
-
-    public abstract String getPartContentType();
-
-    public void setName(String name) {
-        this.name = name;
-    }
-
-    public String getName() {
-        return name;
-    }
-
-    /**
-     * Override this to flush buffers, etc if necessary
-     */
-    void endPart() throws SAXException, TikaException {
-
-    }
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Relationship.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Relationship.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Relationship.java
deleted file mode 100644
index 19b0dd4..0000000
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Relationship.java
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-
-import org.apache.poi.openxml4j.opc.TargetMode;
-
-class Relationship {
-
-    private final String contentType;
-
-    private final String target;
-
-    private final TargetMode targetMode;
-
-    public Relationship(String contentType, String target) {
-        this(contentType, target, null);
-    }
-
-    public Relationship(String contentType, String target, TargetMode targetMode) {
-        this.contentType = contentType;
-        this.target = target;
-        this.targetMode = targetMode;
-    }
-
-    public String getContentType() {
-        return contentType;
-    }
-
-    public String getTarget() {
-        return target;
-    }
-
-    public TargetMode getTargetMode() {
-        return targetMode;
-    }
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsHandler.java
deleted file mode 100644
index 211b048..0000000
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsHandler.java
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-
-import org.apache.poi.openxml4j.opc.ContentTypes;
-import org.apache.poi.openxml4j.opc.TargetMode;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-
-class RelationshipsHandler extends PartHandler {
-
-    final static String REL_NS = "http://schemas.openxmlformats.org/package/2006/relationships";
-
-    private final RelationshipsManager relationshipsManager;
-
-    public RelationshipsHandler(RelationshipsManager relationshipsManager) {
-        this.relationshipsManager = relationshipsManager;
-    }
-
-
-    @Override
-    public void startDocument() throws SAXException {
-    }
-
-    @Override
-    public void endDocument() throws SAXException {
-    }
-
-    @Override
-    public void startPrefixMapping(String prefix, String uri) throws SAXException {
-    }
-
-    @Override
-    public void endPrefixMapping(String prefix) throws SAXException {
-    }
-
-    @Override
-    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
-        if (uri.equals(REL_NS)) {
-            if (localName.equals("Relationship")) {
-                String id = atts.getValue("", "Id");
-                String type = atts.getValue("", "Type");
-                String target = atts.getValue("", "Target");
-                String targetModeString = atts.getValue("", "TargetMode");
-                TargetMode targetMode = "EXTERNAL".equals(targetModeString)? TargetMode.EXTERNAL :
-                        TargetMode.INTERNAL;
-                relationshipsManager.addRelationship(getName(), id, type, target, targetMode);
-            }
-        }
-    }
-
-    @Override
-    public void endElement(String uri, String localName, String qName) throws SAXException {
-    }
-
-    @Override
-    public void characters(char[] ch, int start, int length) throws SAXException {
-    }
-
-    @Override
-    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
-
-    }
-
-    @Override
-    public String getPartContentType() {
-        return ContentTypes.RELATIONSHIPS_PART;
-    }
-
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsManager.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsManager.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsManager.java
deleted file mode 100644
index d1954ac..0000000
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/RelationshipsManager.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-
-import java.util.HashMap;
-import java.util.Map;
-
-import org.apache.poi.openxml4j.opc.TargetMode;
-
-class RelationshipsManager {
-
-    Map<String, Map<String, Relationship>> map = new HashMap<>();
-
-    public void addRelationship(String relsFileName, String id, String type, String target, TargetMode targetMode) {
-        String packageName = convertRelsFileNameToPackageName(relsFileName);
-        Map<String, Relationship> thisPackageRels = map.get(packageName);
-        if (thisPackageRels == null) {
-            thisPackageRels = new HashMap<>();
-        }
-        thisPackageRels.put(id, new Relationship(type, target, targetMode));
-        map.put(packageName, thisPackageRels);
-    }
-
-    public Relationship getRelationship(String packageName, String id) {
-        Map<String, Relationship> thisPackageRels = map.get(packageName);
-        if (thisPackageRels != null) {
-            return thisPackageRels.get(id);
-        }
-        return null;
-    }
-
-    private String convertRelsFileNameToPackageName(String relsFileName) {
-        if ("/_rels/.rels".equals(relsFileName)) {
-            return "/";
-        }
-
-        String tmp = relsFileName;
-        tmp = tmp.replaceFirst("\\/_rels\\/", "/");
-        tmp = tmp.replaceFirst(".rels\\Z", "");
-        return tmp;
-    }
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLHandler.java
deleted file mode 100644
index cf919cc..0000000
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLHandler.java
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-
-import java.util.HashMap;
-import java.util.Map;
-
-import org.apache.poi.xwpf.usermodel.XWPFRelation;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-class Word2006MLHandler extends DefaultHandler {
-
-    final static String PKG_NS = "http://schemas.microsoft.com/office/2006/xmlPackage";
-
-
-    private final XHTMLContentHandler handler;
-    private final Metadata metadata;
-    private final ParseContext parseContext;
-
-    private final Map<String, PartHandler> partHandlers = new HashMap<>();
-    private final BinaryDataHandler binaryDataHandler;
-    private final RelationshipsManager relationshipsManager = new RelationshipsManager();
-    private PartHandler currentPartHandler = null;
-
-    public Word2006MLHandler(XHTMLContentHandler handler, Metadata metadata, ParseContext context) {
-        this.handler = handler;
-        this.metadata = metadata;
-        this.parseContext = context;
-
-        addPackageHandler(new RelationshipsHandler(relationshipsManager));
-
-        addPackageHandler(new BodyContentHandler(
-                XWPFRelation.DOCUMENT.getContentType(),
-                relationshipsManager,
-                handler, metadata, context));
-        addPackageHandler(new BodyContentHandler(
-                XWPFRelation.FOOTNOTE.getContentType(),
-                relationshipsManager,
-                handler, metadata, context));
-        addPackageHandler(new BodyContentHandler(
-                "application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml",
-                relationshipsManager,
-                handler, metadata, context));
-        addPackageHandler(new BodyContentHandler(
-                XWPFRelation.HEADER.getContentType(),
-                relationshipsManager,
-                handler, metadata, context));
-        addPackageHandler(new BodyContentHandler(
-                XWPFRelation.FOOTER.getContentType(),
-                relationshipsManager,
-                handler, metadata, context));
-        addPackageHandler(new BodyContentHandler(
-                "application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml",
-                relationshipsManager,
-                handler, metadata, context));
-        addPackageHandler(new BodyContentHandler(
-                "application/vnd.openxmlformats-officedocument.wordprocessingml.document.glossary+xml",
-                relationshipsManager,
-                handler, metadata, context));
-        addPackageHandler(new BodyContentHandler(
-                "application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml",
-                relationshipsManager,
-                handler, metadata, context));
-
-        addPackageHandler(new CorePropertiesHandler(metadata));
-        addPackageHandler(new ExtendedPropertiesHandler(metadata));
-        binaryDataHandler = new BinaryDataHandler(handler, metadata, context);
-    }
-
-    private void addPackageHandler(PartHandler partHandler) {
-        partHandlers.put(partHandler.getPartContentType(), partHandler);
-    }
-
-
-    @Override
-    public void startDocument() throws SAXException {
-    }
-
-    @Override
-    public void endDocument() throws SAXException {
-    }
-
-    @Override
-    public void startPrefixMapping(String prefix, String uri) throws SAXException {
-    }
-
-    @Override
-    public void endPrefixMapping(String prefix) throws SAXException {
-
-    }
-
-    @Override
-    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
-        if (uri.equals(PKG_NS) && localName.equals("part")) {
-            //start of a package
-            String name = atts.getValue(PKG_NS, "name");
-            String contentType = atts.getValue(PKG_NS, "contentType");
-            currentPartHandler = partHandlers.get(contentType);
-            //for now treat every unknown part type
-            //as if it contained binary data
-            if (currentPartHandler == null) {
-                currentPartHandler = binaryDataHandler;
-            }
-            if (currentPartHandler != null) {
-                currentPartHandler.setName(name);
-            }
-        } else if (currentPartHandler != null) {
-            currentPartHandler.startElement(uri, localName, qName, atts);
-        }
-
-    }
-
-    @Override
-    public void endElement(String uri, String localName, String qName) throws SAXException {
-        if (uri.equals(PKG_NS) && localName.equals("part")) {
-            //do post processing
-            if (currentPartHandler != null) {
-                try {
-                    currentPartHandler.endPart();
-                } catch (TikaException e) {
-                    throw new SAXException(e);
-                }
-            }
-            //then reset
-            currentPartHandler = null;
-        } else if (currentPartHandler != null) {
-            currentPartHandler.endElement(uri, localName, qName);
-        }
-    }
-
-    @Override
-    public void characters(char[] ch, int start, int length) throws SAXException {
-        if (currentPartHandler != null) {
-            currentPartHandler.characters(ch, start, length);
-        }
-    }
-
-    @Override
-    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
-        if (currentPartHandler != null) {
-            currentPartHandler.characters(ch, start, length);
-        }
-
-    }
-
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParser.java
deleted file mode 100644
index 4609bf5..0000000
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParser.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.EmbeddedContentHandler;
-import org.apache.tika.sax.OfflineContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-
-public class Word2006MLParser extends AbstractParser {
-
-    protected static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(
-                    MediaType.application("vnd.ms-word2006ml"));
-
-    @Override
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
-
-    @Override
-    public void parse(InputStream stream, ContentHandler handler,
-                      Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
-        final XHTMLContentHandler xhtml =
-                new XHTMLContentHandler(handler, metadata);
-
-        xhtml.startDocument();
-
-        try {
-            context.getSAXParser().parse(
-                    new CloseShieldInputStream(stream),
-                    new OfflineContentHandler(new EmbeddedContentHandler(
-                            new Word2006MLHandler(xhtml, metadata, context))));
-        } catch (SAXException e) {
-            throw new TikaException("XML parse error", e);
-        } finally {
-            xhtml.endDocument();
-        }
-    }
-}