You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/11/30 20:31:15 UTC

[6/7] tika git commit: TIKA 1321 initial commit

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
new file mode 100644
index 0000000..dce36a2
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
@@ -0,0 +1,318 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf;
+
+
+import java.util.Date;
+import java.util.Map;
+
+import org.apache.tika.utils.DateUtils;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * This class is intended to handle anything that might contain IBodyElements:
+ * main document, headers, footers, notes, etc.
+ */
+
+public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
+
+
+    /**
+     * Kind of tracked change the handler is currently inside.
+     * {@link #NONE} is the initial state and the state restored when an
+     * ins/del/moveTo/moveFrom element ends.
+     */
+    enum EditType {
+        NONE,
+        INSERT, //set while inside an "ins" element
+        DELETE, //set while inside a "del" element
+        MOVE_TO, //set while inside a "moveTo" element
+        MOVE_FROM //set while inside a "moveFrom" element
+    }
+
+
+    private final static String W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
+    private final static String MC_NS = "http://schemas.openxmlformats.org/markup-compatibility/2006";
+    private final static String OFFICE_DOC_RELATIONSHIP_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
+
+    private final static char[] TAB = new char[1];
+
+    static {
+        TAB[0] = '\t';
+    }
+
+    private final XWPFBodyContentsHandler bodyContentsHandler;
+    //private final RelationshipsManager relationshipsManager;
+    private final Map<String, String> hyperlinks;
+
+    private final StringBuilder runBuffer = new StringBuilder();
+
+    private boolean inR = false;
+    private boolean inT = false;
+    private int pDepth = 0;
+    private boolean inRPr = false;
+    private boolean inNumPr = false;
+    private boolean inDelText = false;
+    private boolean inHyperlink = false;
+
+    //alternate content can be embedded in itself.
+    //need to track depth.
+    //if in alternate, choose fallback, maybe make this configurable?
+    private int inACChoiceDepth = 0;
+    private int inACFallbackDepth = 0;
+    private EditType editType = EditType.NONE;
+    private String hyperlink = null;
+
+    private XWPFRunProperties currRunProperties = new XWPFRunProperties();
+
+    /**
+     * @param bodyContentsHandler receives the paragraph, run, table, edit and
+     *                            note callbacks generated while parsing
+     * @param hyperlinks          lookup from the relationship id found on a
+     *                            "hyperlink" element to the link target
+     */
+    public XWPFDocumentXMLBodyHandler(XWPFBodyContentsHandler bodyContentsHandler,
+                                      Map<String, String> hyperlinks) {
+        this.bodyContentsHandler = bodyContentsHandler;
+        this.hyperlinks = hyperlinks;
+    }
+
+
+    @Override
+    public void startDocument() throws SAXException {
+        //no-op: nothing to initialize at document start
+    }
+
+    @Override
+    public void endDocument() throws SAXException {
+        //no-op: runs are flushed as their elements end, not at document end
+    }
+
+    @Override
+    public void startPrefixMapping(String prefix, String uri) throws SAXException {
+        //no-op: elements are matched on namespace uri and local name, not on prefix
+    }
+
+    @Override
+    public void endPrefixMapping(String prefix) throws SAXException {
+        //no-op: prefix mappings are not tracked
+    }
+
+    /**
+     * Updates the parser state for an opening element and forwards structural
+     * events (paragraphs, tables, tracked edits, note references) to the body
+     * contents handler.
+     * <p>
+     * Everything nested inside a markup-compatibility <code>Choice</code>
+     * element is skipped. Other elements are only interpreted when their
+     * namespace uri is null or the WordprocessingML main namespace.
+     *
+     * @param uri       namespace uri of the element; may be null
+     * @param localName local name used to identify the element
+     * @param qName     qualified name; not used
+     * @param atts      attributes of the element
+     */
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+        if (uri != null && uri.equals(MC_NS)) {
+            if (localName.equals("Choice")) {
+                inACChoiceDepth++;
+            } else if (localName.equals("Fallback")) {
+                inACFallbackDepth++;
+            }
+        }
+
+        //skip the Choice branch of alternate content entirely
+        if (inACChoiceDepth > 0) {
+            return;
+        }
+
+        if (uri == null || uri.equals(W_NS)) {
+            if (localName.equals("p")) {
+                bodyContentsHandler.startParagraph();
+                pDepth++;
+            } else if (localName.equals("r")) {
+                inR = true;
+            } else if (localName.equals("t")) {
+                inT = true;
+            } else if (localName.equals("tab")) {
+                runBuffer.append("\t");
+            } else if (localName.equals("tbl")) {
+                bodyContentsHandler.startTable();
+            } else if (localName.equals("tc")) {
+                bodyContentsHandler.startTableCell();
+            } else if (localName.equals("tr")) {
+                bodyContentsHandler.startTableRow();
+            } else if (localName.equals("numPr")) {
+                inNumPr = true;
+            } else if (localName.equals("rPr")) {
+                inRPr = true;
+            } else if (inR && inRPr && localName.equals("i")) {
+                //rprs don't have to be inR; ignore those that aren't
+                currRunProperties.setItalics(true);
+            } else if (inR && inRPr && localName.equals("b")) {
+                currRunProperties.setBold(true);
+            } else if (localName.equals("delText")) {
+                inDelText = true;
+            } else if (localName.equals("ins")) {
+                //enum constants are referenced through the type, not the instance field
+                startEditedSection(EditType.INSERT, atts);
+            } else if (localName.equals("del")) {
+                startEditedSection(EditType.DELETE, atts);
+            } else if (localName.equals("moveTo")) {
+                startEditedSection(EditType.MOVE_TO, atts);
+            } else if (localName.equals("moveFrom")) {
+                startEditedSection(EditType.MOVE_FROM, atts);
+            } else if (localName.equals("hyperlink")) {
+                String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
+                //always reassign: a hyperlink without a relationship id must not
+                //inherit the target resolved for an earlier hyperlink
+                hyperlink = (hyperlinkId == null) ? null : hyperlinks.get(hyperlinkId);
+                inHyperlink = true;
+            } else if (localName.equals("footnoteReference")) {
+                String id = atts.getValue(W_NS, "id");
+                bodyContentsHandler.footnoteReference(id);
+            } else if (localName.equals("endnoteReference")) {
+                String id = atts.getValue(W_NS, "id");
+                bodyContentsHandler.endnoteReference(id);
+            }
+            //TODO: headerReference, footerReference, commentRangeEnd
+        }
+    }
+
+    private void startEditedSection(EditType editType, Attributes atts) {
+        String editAuthor = atts.getValue(W_NS, "author");
+        String editDateString = atts.getValue(W_NS, "date");
+        Date editDate = null;
+        if (editDateString != null) {
+            editDate = DateUtils.tryToParse(editDateString);
+        }
+        bodyContentsHandler.startEditedSection(editAuthor, editDate, editType);
+        this.editType = editType;
+    }
+
+    private int getIntVal(Attributes atts) {
+        String valString = atts.getValue(W_NS, "val");
+        if (valString != null) {
+            try {
+                return Integer.parseInt(valString);
+            } catch (NumberFormatException e) {
+                //swallow
+            }
+        }
+        return -1;
+    }
+
+
+    @Override
+    public void endElement(String uri, String localName, String qName) throws SAXException {
+        if (uri.equals(MC_NS)) {
+            if (localName.equals("Choice")) {
+                inACChoiceDepth--;
+            } else if (localName.equals("Fallback")) {
+                inACFallbackDepth--;
+            }
+        }
+        if (uri == null || uri.equals(W_NS)) {
+            if (inACChoiceDepth > 0) {
+                return;
+            }
+
+
+            if (localName.equals("r") && !inHyperlink) {
+                bodyContentsHandler.run(currRunProperties, runBuffer.toString());
+                inR = false;
+                runBuffer.setLength(0);
+                currRunProperties.setBold(false);
+                currRunProperties.setItalics(false);
+            } else if (localName.equals("p")) {
+                bodyContentsHandler.endParagraph();
+                pDepth--;
+            } else if (localName.equals("t")) {
+                inT = false;
+            } else if (localName.equals("tbl")) {
+                bodyContentsHandler.endTable();
+            } else if (localName.equals("tc")) {
+                bodyContentsHandler.endTableCell();
+            } else if (localName.equals("tr")) {
+                bodyContentsHandler.endTableRow();
+            } else if (localName.equals("rPr")) {
+                inRPr = false;
+            } else if (localName.equals("delText")) {
+                inDelText = false;
+            } else if (localName.equals("ins") || localName.equals("del") ||
+                    localName.equals("moveTo") || localName.equals("moveFrom")) {
+                editType = EditType.NONE;
+            } else if (localName.equals("hyperlink")) {
+                if (hyperlink != null) {
+                    bodyContentsHandler.hyperlinkRun(hyperlink, runBuffer.toString());
+                } else {
+                    bodyContentsHandler.run(currRunProperties, runBuffer.toString());
+                }
+                runBuffer.setLength(0);
+                inHyperlink = false;
+            }
+        }
+    }
+
+    /**
+     * Buffers character data that belongs to the current run.
+     * <p>
+     * Inside a text element the data is kept, unless the current edit is a
+     * move-from and the handler excludes move-from text. Outside a text
+     * element the data is kept only inside a deletion, and only if the
+     * handler includes deleted text. Alternate-content Choice branches are
+     * ignored.
+     */
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+        if (inACChoiceDepth > 0) {
+            return;
+        }
+
+        final boolean keep;
+        if (inT) {
+            keep = !editType.equals(EditType.MOVE_FROM)
+                    || bodyContentsHandler.getIncludeMoveFromText();
+        } else {
+            keep = bodyContentsHandler.getIncludeDeletedText()
+                    && editType.equals(EditType.DELETE);
+        }
+
+        if (keep) {
+            runBuffer.append(ch, start, length);
+        }
+    }
+
+    /**
+     * Buffers ignorable whitespace found inside a text element, or inside a
+     * deleted-text element when the handler includes deleted text.
+     * Alternate-content Choice branches are ignored.
+     */
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+        if (inACChoiceDepth > 0) {
+            return;
+        }
+
+        //the handler is only consulted when not inside a text element
+        final boolean keep = inT
+                || (bodyContentsHandler.getIncludeDeletedText() && inDelText);
+        if (keep) {
+            runBuffer.append(ch, start, length);
+        }
+    }
+
+
+    /**
+     * Callbacks through which {@link XWPFDocumentXMLBodyHandler} reports the
+     * body content it encounters. None of the methods declares a checked
+     * exception.
+     */
+    public interface XWPFBodyContentsHandler {
+
+        /**
+         * Reports buffered run text.
+         *
+         * @param runProperties bold/italics state collected for the text; the
+         *                      caller reuses and resets this object, so copy it
+         *                      if it has to be kept
+         * @param contents      the buffered text
+         */
+        void run(XWPFRunProperties runProperties, String contents);
+
+        /**
+         * Reports the text of a hyperlink whose target was resolved.
+         *
+         * @param link target looked up for the hyperlink
+         * @param text text buffered while inside the hyperlink
+         */
+        void hyperlinkRun(String link, String text);
+
+        /** Called when a "p" element starts. */
+        void startParagraph();
+
+        /** Called when a "p" element ends. */
+        void endParagraph();
+
+        /** Called when a "tbl" element starts. */
+        void startTable();
+
+        /** Called when a "tbl" element ends. */
+        void endTable();
+
+        /** Called when a "tr" element starts. */
+        void startTableRow();
+
+        /** Called when a "tr" element ends. */
+        void endTableRow();
+
+        /** Called when a "tc" element starts. */
+        void startTableCell();
+
+        /** Called when a "tc" element ends. */
+        void endTableCell();
+
+        /** Signals the start of a structured document tag (SDT). */
+        void startSDT();
+
+        /** Signals the end of a structured document tag (SDT). */
+        void endSDT();
+
+        /**
+         * Called when an "ins", "del", "moveTo" or "moveFrom" element starts.
+         *
+         * @param editor   value of the element's "author" attribute; may be null
+         * @param date     parsed "date" attribute; null if the attribute is absent
+         * @param editType kind of edit that is starting
+         */
+        void startEditedSection(String editor, Date date, EditType editType);
+
+        /** Signals the end of an edited section. */
+        void endEditedSection();
+
+        /**
+         * @return whether character data inside deletions should be buffered
+         */
+        boolean getIncludeDeletedText();
+
+        /**
+         * Called for a "footnoteReference" element.
+         *
+         * @param id value of the element's "id" attribute; may be null
+         */
+        void footnoteReference(String id);
+
+        /**
+         * Called for an "endnoteReference" element.
+         *
+         * @param id value of the element's "id" attribute; may be null
+         */
+        void endnoteReference(String id);
+
+        /**
+         * @return whether text inside a move-from edit should be buffered
+         */
+        boolean getIncludeMoveFromText();
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
new file mode 100644
index 0000000..06ef951
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
@@ -0,0 +1,353 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf;
+
+import javax.xml.parsers.ParserConfigurationException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.POIXMLProperties;
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.util.SAXHelper;
+import org.apache.poi.xwpf.usermodel.XWPFNumbering;
+import org.apache.poi.xwpf.usermodel.XWPFRelation;
+import org.apache.tika.parser.microsoft.ooxml.XWPFListManager;
+import org.apache.xmlbeans.XmlException;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+
+//TODO: move this into POI?
+/**
+ * Experimental class that is based on POI's XSSFEventBasedExcelExtractor
+ *
+ */
+public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
+
+    private OPCPackage container;
+    private POIXMLProperties properties;
+
+    /**
+     * Opens the package at the given path and extracts from it.
+     *
+     * @param path path handed to {@link OPCPackage#open(String)}
+     */
+    public XWPFEventBasedWordExtractor(String path) throws XmlException, OpenXML4JException, IOException {
+        this(OPCPackage.open(path));
+    }
+
+    /**
+     * Creates an extractor that reads directly from the given package and
+     * loads the package's document properties.
+     *
+     * @param container package to extract text from
+     */
+    public XWPFEventBasedWordExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
+        //no POIXMLDocument is built; the package parts are read directly
+        super((POIXMLDocument) null);
+        this.container = container;
+        this.properties = new POIXMLProperties(container);
+    }
+
+
+    /**
+     * Command line entry point: prints the text extracted from the file named
+     * by the first argument to standard out.
+     *
+     * @param args expects the path of a Word document as the first element
+     * @throws Exception if the file cannot be opened or closed
+     */
+    public static void main(String[] args) throws Exception {
+        if (args.length < 1) {
+            System.err.println("Use:");
+            //this extractor handles Word documents, not spreadsheets
+            System.err.println("  XWPFEventBasedWordExtractor <filename.docx>");
+            System.exit(1);
+        }
+
+        XWPFEventBasedWordExtractor extractor = new XWPFEventBasedWordExtractor(args[0]);
+        try {
+            System.out.println(extractor.getText());
+        } finally {
+            //close even if extraction fails
+            extractor.close();
+        }
+    }
+
+    /**
+     * @return the package this extractor was created with
+     */
+    public OPCPackage getPackage() {
+        return this.container;
+    }
+
+    /**
+     * @return the core properties of the properties loaded from the package
+     */
+    public POIXMLProperties.CoreProperties getCoreProperties() {
+        return this.properties.getCoreProperties();
+    }
+
+    /**
+     * @return the extended properties of the properties loaded from the package
+     */
+    public POIXMLProperties.ExtendedProperties getExtendedProperties() {
+        return this.properties.getExtendedProperties();
+    }
+
+    /**
+     * @return the custom properties of the properties loaded from the package
+     */
+    public POIXMLProperties.CustomProperties getCustomProperties() {
+        return this.properties.getCustomProperties();
+    }
+
+
+    /**
+     * Extracts the text of the main document parts followed by the text of
+     * the glossary document parts.
+     * <p>
+     * A part that fails with an IOException or SAXException has its stack
+     * trace printed and is otherwise skipped.
+     *
+     * @return the concatenated text
+     */
+    @Override
+    public String getText() {
+        StringBuilder sb = new StringBuilder();
+        //handle main document
+        appendTextOfParts(XWPFRelation.DOCUMENT.getContentType(), sb);
+        //handle glossary document
+        appendTextOfParts(XWPFRelation.GLOSSARY_DOCUMENT.getContentType(), sb);
+        return sb.toString();
+    }
+
+    /**
+     * Appends the text of every package part with the given content type.
+     */
+    private void appendTextOfParts(String contentType, StringBuilder sb) {
+        List<PackagePart> parts = container.getPartsByContentType(contentType);
+        if (parts == null) {
+            return;
+        }
+        //likely only one, but why not...
+        for (PackagePart part : parts) {
+            try {
+                handleDocumentPart(part, sb);
+            } catch (IOException | SAXException e) {
+                e.printStackTrace();
+            }
+        }
+    }
+
+
+    /**
+     * Appends the text of a document part and of the parts related to it:
+     * headers first, then the document itself, then footnotes, comments,
+     * footers and endnotes.
+     *
+     * @param documentPart main or glossary document part
+     * @param sb           buffer the text is appended to
+     */
+    private void handleDocumentPart(PackagePart documentPart, StringBuilder sb) throws IOException, SAXException {
+        //load the numbering/list manager and styles from the main document part
+        XWPFListManager listManager = new XWPFListManager(loadNumbering(documentPart));
+        //TODO: XWPFStyles styles = loadStyles(documentPart);
+
+        //headers
+        handleRelatedParts(documentPart, XWPFRelation.HEADER, listManager, sb);
+
+        //main document
+        handlePart(documentPart, listManager, sb);
+
+        //for now, just dump other components at end
+        XWPFRelation[] trailing = new XWPFRelation[]{
+                XWPFRelation.FOOTNOTE,
+                XWPFRelation.COMMENT,
+                XWPFRelation.FOOTER,
+                XWPFRelation.ENDNOTE
+        };
+        for (XWPFRelation relation : trailing) {
+            handleRelatedParts(documentPart, relation, listManager, sb);
+        }
+    }
+
+    /**
+     * Appends the text of every part related to the document part through the
+     * given relation. An InvalidFormatException ends the processing of this
+     * relation and is otherwise ignored.
+     */
+    private void handleRelatedParts(PackagePart documentPart, XWPFRelation relation,
+                                    XWPFListManager listManager, StringBuilder sb)
+            throws IOException, SAXException {
+        try {
+            PackageRelationshipCollection related =
+                    documentPart.getRelationshipsByType(relation.getRelation());
+            if (related == null) {
+                return;
+            }
+            for (int idx = 0; idx < related.size(); idx++) {
+                PackagePart relatedPart = documentPart.getRelatedPart(related.getRelationship(idx));
+                handlePart(relatedPart, listManager, sb);
+            }
+        } catch (InvalidFormatException ignored) {
+            //swallow
+        }
+    }
+
+    /**
+     * SAX-parses one package part and appends its text to the buffer.
+     *
+     * @param packagePart     part to parse
+     * @param xwpfListManager list manager for the document; not used by the
+     *                        code visible here
+     * @param buffer          buffer the extracted text is appended to
+     */
+    private void handlePart(PackagePart packagePart,
+                            XWPFListManager xwpfListManager, StringBuilder buffer) throws IOException, SAXException {
+
+        //hyperlink targets are resolved per part, from the part's own relationships
+        Map<String, String> hyperlinks = loadHyperlinkRelationships(packagePart);
+        try (InputStream stream = packagePart.getInputStream()) {
+            XMLReader reader = SAXHelper.newXMLReader();
+            reader.setContentHandler(new XWPFDocumentXMLBodyHandler(
+                    new XWPFToTextContentHandler(buffer), hyperlinks));
+            //the shield presumably keeps the parser from closing the stream itself;
+            //the try-with-resources statement closes it
+            reader.parse(new InputSource(new CloseShieldInputStream(stream)));
+
+        } catch (ParserConfigurationException e) {
+            //NOTE(review): a reader that cannot be created leaves this part's text out
+            e.printStackTrace();
+        }
+
+    }
+
+    /**
+     * Collects the hyperlink relationships of a part.
+     *
+     * @param bodyPart part whose relationships are read
+     * @return map from relationship id to target uri; empty if the part has
+     * no usable hyperlink relationships or they cannot be read
+     */
+    private Map<String, String> loadHyperlinkRelationships(PackagePart bodyPart) {
+        Map<String, String> hyperlinks = new HashMap<>();
+        try {
+            PackageRelationshipCollection prc = bodyPart.getRelationshipsByType(XWPFRelation.HYPERLINK.getRelation());
+            //same null guard as the other relationship lookups in this class
+            if (prc == null) {
+                return hyperlinks;
+            }
+            for (int i = 0; i < prc.size(); i++) {
+                PackageRelationship pr = prc.getRelationship(i);
+                if (pr == null) {
+                    continue;
+                }
+                String id = pr.getId();
+                String url = (pr.getTargetURI() == null) ? null : pr.getTargetURI().toString();
+                //only complete id/target pairs are usable for lookups
+                if (id != null && url != null) {
+                    hyperlinks.put(id, url);
+                }
+            }
+        } catch (InvalidFormatException e) {
+            //best effort: keep whatever was collected before the failure
+        }
+        return hyperlinks;
+    }
+/*
+    private XWPFStyles loadStyles(PackagePart packagePart) {
+        try {
+            PackageRelationshipCollection stylesParts =
+                    packagePart.getRelationshipsByType(XWPFRelation.STYLES.getRelation());
+            if (stylesParts.size() > 0) {
+                PackageRelationship stylesRelationShip = stylesParts.getRelationship(0);
+                if (stylesRelationShip == null) {
+                    return null;
+                }
+                PackagePart stylesPart = opcPackage.getPart(stylesRelationShip);
+                if (stylesPart == null) {
+                    return null;
+                }
+                return new XWPFStyles(stylesPart);
+            }
+        } catch (IOException|OpenXML4JException e) {
+            //swallow
+        }
+        return null;
+
+    }
+*/
+    /**
+     * Loads the numbering definitions referenced by a part.
+     *
+     * @param packagePart part whose first numbering relationship is followed
+     * @return the numbering, or null if there is no numbering relationship,
+     * the related part cannot be found, or loading fails
+     */
+    private XWPFNumbering loadNumbering(PackagePart packagePart) {
+        try {
+            PackageRelationshipCollection candidates =
+                    packagePart.getRelationshipsByType(XWPFRelation.NUMBERING.getRelation());
+            if (candidates.size() <= 0) {
+                return null;
+            }
+            //only the first numbering relationship is considered
+            PackageRelationship first = candidates.getRelationship(0);
+            if (first == null) {
+                return null;
+            }
+            PackagePart target = container.getPart(first);
+            return (target == null) ? null : new XWPFNumbering(target);
+        } catch (IOException | OpenXML4JException e) {
+            //swallow
+            return null;
+        }
+    }
+
+    /**
+     * Body contents handler that renders the callbacks as plain text into a
+     * StringBuilder. It is a static nested class because it never touches
+     * the enclosing extractor.
+     */
+    private static class XWPFToTextContentHandler implements XWPFDocumentXMLBodyHandler.XWPFBodyContentsHandler {
+        private final StringBuilder buffer;
+
+        /**
+         * @param buffer buffer the text is appended to
+         */
+        public XWPFToTextContentHandler(StringBuilder buffer) {
+            this.buffer = buffer;
+        }
+
+        /** Appends the run text; the run properties are ignored. */
+        @Override
+        public void run(XWPFRunProperties runProperties, String contents) {
+            buffer.append(contents);
+        }
+
+        /** Appends the hyperlink text in parentheses; the link target is not written. */
+        @Override
+        public void hyperlinkRun(String link, String text) {
+            buffer.append(" (").append(text).append(") ");
+        }
+
+        @Override
+        public void startParagraph() {
+            //no-op
+        }
+
+        /** Terminates the paragraph with a newline. */
+        @Override
+        public void endParagraph() {
+            buffer.append("\n");
+        }
+
+        @Override
+        public void startTable() {
+            //no-op
+        }
+
+        @Override
+        public void endTable() {
+            //no-op
+        }
+
+        @Override
+        public void startTableRow() {
+            //no-op
+        }
+
+        /** Terminates the table row with a newline. */
+        @Override
+        public void endTableRow() {
+            buffer.append("\n");
+        }
+
+        @Override
+        public void startTableCell() {
+            //no-op
+        }
+
+        /** Separates table cells with a tab. */
+        @Override
+        public void endTableCell() {
+            buffer.append("\t");
+        }
+
+        @Override
+        public void startSDT() {
+            //no-op
+        }
+
+        /** Terminates the structured document tag with a newline. */
+        @Override
+        public void endSDT() {
+            buffer.append("\n");
+        }
+
+        @Override
+        public void startEditedSection(String editor, Date date, XWPFDocumentXMLBodyHandler.EditType editType) {
+            //no-op
+        }
+
+        @Override
+        public void endEditedSection() {
+            //no-op
+        }
+
+        /** Deleted text is always included in the plain text output. */
+        @Override
+        public boolean getIncludeDeletedText() {
+            return true;
+        }
+
+        @Override
+        public void footnoteReference(String id) {
+            //no-op
+        }
+
+        @Override
+        public void endnoteReference(String id) {
+            //no-op
+        }
+
+        /** Move-from text is never included in the plain text output. */
+        @Override
+        public boolean getIncludeMoveFromText() {
+            return false;
+        }
+    }
+}
+

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFRunProperties.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFRunProperties.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFRunProperties.java
new file mode 100644
index 0000000..ad2d656
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFRunProperties.java
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf;
+
+/**
+ * Bold/italics state of a run.
+ * <p>
+ * WARNING: instances are mutable. Copy the values if they have to outlive
+ * the call that handed the instance over.
+ */
+
+class XWPFRunProperties {
+    boolean italics = false;
+    boolean bold = false;
+
+    /**
+     * @return whether the run is bold
+     */
+    public boolean getBold() {
+        return this.bold;
+    }
+
+    /**
+     * @param bold whether the run is bold
+     */
+    public void setBold(boolean bold) {
+        this.bold = bold;
+    }
+
+    /**
+     * @return whether the run is in italics
+     */
+    public boolean getItalics() {
+        return this.italics;
+    }
+
+    /**
+     * @param italics whether the run is in italics
+     */
+    public void setItalics(boolean italics) {
+        this.italics = italics;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
new file mode 100644
index 0000000..2f27739
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
@@ -0,0 +1,224 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf;
+
+
+import java.util.Date;
+
+import org.apache.tika.parser.microsoft.OfficeParserConfig;
+import org.apache.tika.parser.microsoft.ooxml.XWPFListManager;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Body contents handler that writes the callbacks of
+ * {@link XWPFDocumentXMLBodyHandler} as XHTML to an {@link XHTMLContentHandler}.
+ * <p>
+ * The callback interface declares no checked exceptions, so every
+ * {@link SAXException} raised while writing is swallowed.
+ */
+public class XWPFTikaBodyPartHandler implements XWPFDocumentXMLBodyHandler.XWPFBodyContentsHandler {
+
+    private final static char[] NEWLINE = new char[]{'\n'};
+    private final static char[] TAB = new char[]{'\t'};
+
+    private final XHTMLContentHandler xhtml;
+    private final XWPFListManager listManager;
+    private final boolean includeDeletedText;
+    private final boolean includeMoveFromText;
+
+    private int pDepth = 0; //paragraph depth
+    private boolean isItalics = false;
+    private boolean isBold = false;
+
+    /**
+     * @param xhtml        handler the XHTML is written to
+     * @param listManager  list manager of the document
+     * @param parserConfig supplies the include-deleted and include-move-from settings
+     */
+    public XWPFTikaBodyPartHandler(XHTMLContentHandler xhtml, XWPFListManager listManager, OfficeParserConfig parserConfig) {
+        this.xhtml = xhtml;
+        this.listManager = listManager;
+        this.includeDeletedText = parserConfig.getIncludeDeletedContent();
+        this.includeMoveFromText = parserConfig.getIncludeMoveFromContent();
+    }
+
+    /**
+     * Writes the run text, wrapped in "b" and/or "i" elements according to
+     * the run properties.
+     */
+    @Override
+    public void run(XWPFRunProperties runProperties, String contents) {
+        //TODO: smooth out bold/italics to handle only changes
+        //If two runs are bold, only add <b> at beginning and end of the run pair
+        try {
+            final boolean bold = runProperties.getBold();
+            final boolean italics = runProperties.getItalics();
+            if (bold) {
+                xhtml.startElement("b");
+            }
+            if (italics) {
+                xhtml.startElement("i");
+            }
+
+            xhtml.characters(contents);
+
+            //close in reverse order of opening
+            if (italics) {
+                xhtml.endElement("i");
+            }
+            if (bold) {
+                xhtml.endElement("b");
+            }
+        } catch (SAXException ignored) {
+            //swallowed: the interface cannot propagate it
+        }
+    }
+
+    /**
+     * Writes the text, wrapped in an "a" element when a link is given.
+     */
+    @Override
+    public void hyperlinkRun(String link, String text) {
+        try {
+            if (link == null) {
+                xhtml.characters(text);
+            } else {
+                xhtml.startElement("a", "href", link);
+                xhtml.characters(text);
+                xhtml.endElement("a");
+            }
+        } catch (SAXException ignored) {
+            //swallowed: the interface cannot propagate it
+        }
+    }
+
+    /**
+     * Opens a "p" element for an outermost paragraph; nested paragraphs only
+     * increase the depth.
+     */
+    @Override
+    public void startParagraph() {
+        if (pDepth == 0) {
+            openElement("p");
+        }
+        pDepth++;
+    }
+
+    /**
+     * Closes the "p" element for an outermost paragraph; a nested paragraph
+     * ends with a newline instead.
+     */
+    @Override
+    public void endParagraph() {
+        try {
+            if (pDepth == 1) {
+                xhtml.endElement("p");
+            } else {
+                xhtml.characters(NEWLINE, 0, 1);
+            }
+        } catch (SAXException ignored) {
+            //swallowed: the interface cannot propagate it
+        }
+        pDepth--;
+    }
+
+    @Override
+    public void startTable() {
+        openElement("table");
+    }
+
+    @Override
+    public void endTable() {
+        closeElement("table");
+    }
+
+    @Override
+    public void startTableRow() {
+        openElement("tr");
+    }
+
+    @Override
+    public void endTableRow() {
+        closeElement("tr");
+    }
+
+    @Override
+    public void startTableCell() {
+        openElement("td");
+    }
+
+    @Override
+    public void endTableCell() {
+        closeElement("td");
+    }
+
+    @Override
+    public void startSDT() {
+        //no-op
+    }
+
+    @Override
+    public void endSDT() {
+        //no-op
+    }
+
+    @Override
+    public void startEditedSection(String editor, Date date, XWPFDocumentXMLBodyHandler.EditType editType) {
+        //no-op
+    }
+
+    @Override
+    public void endEditedSection() {
+        //no-op
+    }
+
+    @Override
+    public boolean getIncludeDeletedText() {
+        return includeDeletedText;
+    }
+
+    /** Writes the footnote id in square brackets; a null id writes nothing. */
+    @Override
+    public void footnoteReference(String id) {
+        bracketedReference(id);
+    }
+
+    /** Writes the endnote id in square brackets; a null id writes nothing. */
+    @Override
+    public void endnoteReference(String id) {
+        bracketedReference(id);
+    }
+
+    @Override
+    public boolean getIncludeMoveFromText() {
+        return includeMoveFromText;
+    }
+
+    /** Starts the named element, swallowing any SAXException. */
+    private void openElement(String name) {
+        try {
+            xhtml.startElement(name);
+        } catch (SAXException ignored) {
+            //swallowed: the interface cannot propagate it
+        }
+    }
+
+    /** Ends the named element, swallowing any SAXException. */
+    private void closeElement(String name) {
+        try {
+            xhtml.endElement(name);
+        } catch (SAXException ignored) {
+            //swallowed: the interface cannot propagate it
+        }
+    }
+
+    /** Writes "[id]" as characters unless the id is null. */
+    private void bracketedReference(String id) {
+        if (id == null) {
+            return;
+        }
+        try {
+            xhtml.characters("[");
+            xhtml.characters(id);
+            xhtml.characters("]");
+        } catch (SAXException ignored) {
+            //swallowed: the interface cannot propagate it
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/AbstractPartHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/AbstractPartHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/AbstractPartHandler.java
new file mode 100644
index 0000000..9aa5471
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/AbstractPartHandler.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006;
+
+import org.apache.tika.exception.TikaException;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Skeleton {@link PartHandler} built on {@link DefaultHandler}: it keeps the
+ * part name, supplies an empty {@link #endPart()} and leaves the content type
+ * to the concrete subclass.
+ */
+abstract class AbstractPartHandler extends DefaultHandler implements PartHandler {
+
+    //value of the last setName(String) call; null until one is made
+    private String name;
+
+    /**
+     * @return the content type string reported by the concrete handler
+     */
+    @Override
+    public abstract String getContentType();
+
+    /**
+     * @return the name most recently stored via {@link #setName(String)},
+     * or {@code null} if none has been stored
+     */
+    @Override
+    public String getName() {
+        return this.name;
+    }
+
+    /**
+     * @param name name to store for this handler
+     */
+    @Override
+    public void setName(String name) {
+        this.name = name;
+    }
+
+    /**
+     * Does nothing by default; subclasses override this when they have
+     * buffers (or similar) to flush.
+     */
+    @Override
+    public void endPart() throws SAXException, TikaException {
+        //intentionally empty
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/BinaryDataHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/BinaryDataHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/BinaryDataHandler.java
new file mode 100644
index 0000000..4a13799
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/BinaryDataHandler.java
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006;
+
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.codec.binary.Base64;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+/**
+ * Collects the character content of {@code binaryData} elements in the
+ * xmlPackage namespace ({@link Word2006MLDocHandler#PKG_NS}) and, when the
+ * part ends, base64-decodes it and passes the bytes to the
+ * {@link EmbeddedDocumentExtractor} obtained from the parse context.
+ */
+class BinaryDataHandler extends AbstractPartHandler {
+
+    private static final String BINARY_DATA = "binaryData";
+
+    private final XHTMLContentHandler handler;
+    //stored by the constructor; not read anywhere in this class
+    private final Metadata metadata;
+    private final ParseContext parseContext;
+
+    //true between the start and end tags of a binaryData element
+    private boolean inBinaryData = false;
+    //base64 text gathered so far for the current part
+    private StringBuilder buffer = new StringBuilder();
+
+    final Base64 base64 = new Base64();
+
+
+    /**
+     * @param handler  handler that receives the output of embedded parsing
+     * @param metadata metadata object kept by this handler
+     * @param context  parse context used to look up the embedded document extractor
+     */
+    public BinaryDataHandler(XHTMLContentHandler handler, Metadata metadata, ParseContext context) {
+        this.handler = handler;
+        this.metadata = metadata;
+        this.parseContext = context;
+    }
+
+
+    @Override
+    public void startDocument() throws SAXException {
+    }
+
+    @Override
+    public void endDocument() throws SAXException {
+
+    }
+
+    /**
+     * Decodes whatever base64 text was collected for this part and hands it
+     * to the embedded document extractor.  Does nothing if no text was
+     * collected.
+     *
+     * @throws SAXException  if embedded parsing reports one
+     * @throws TikaException if an {@link IOException} occurs while handling
+     *                       the decoded stream
+     */
+    @Override
+    public void endPart() throws SAXException, TikaException {
+        if (!hasData()) {
+            return;
+        }
+        EmbeddedDocumentExtractor embeddedDocumentExtractor =
+                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext);
+        Metadata embeddedMetadata = new Metadata();
+        try (TikaInputStream stream = TikaInputStream.get(getInputStream())) {
+            embeddedDocumentExtractor.parseEmbedded(stream, handler, embeddedMetadata, false);
+        } catch (IOException e) {
+            throw new TikaException("error in finishing part", e);
+        } finally {
+            //clear on failure too, so that text from this part can never be
+            //carried over if this handler instance is used again
+            buffer.setLength(0);
+        }
+    }
+
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+        if (isBinaryData(uri, localName)) {
+            inBinaryData = true;
+        }
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String qName) throws SAXException {
+        if (isBinaryData(uri, localName)) {
+            inBinaryData = false;
+        }
+    }
+
+    /**
+     * Appends character data to the buffer, but only while inside a
+     * binaryData element; all other text is ignored.
+     */
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+        if (inBinaryData) {
+            buffer.append(ch, start, length);
+        }
+    }
+
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+
+    }
+
+    /**
+     * @return always the empty string
+     */
+    @Override
+    public String getContentType() {
+        return "";
+    }
+
+    /**
+     * @return {@code true} if any binaryData text has been collected and not yet consumed
+     */
+    boolean hasData() {
+        return buffer.length() > 0;
+    }
+
+    //constant-first comparisons: safe even if a caller passes null
+    private static boolean isBinaryData(String uri, String localName) {
+        return Word2006MLDocHandler.PKG_NS.equals(uri) && BINARY_DATA.equals(localName);
+    }
+
+    //base64-decodes the collected text and wraps the bytes in an in-memory stream
+    private InputStream getInputStream() {
+        byte[] bytes = base64.decode(buffer.toString());
+        return new ByteArrayInputStream(bytes);
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/BodyPartHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/BodyPartHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/BodyPartHandler.java
new file mode 100644
index 0000000..4d04b2b
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/BodyPartHandler.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006;
+
+import java.util.HashMap;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.microsoft.OfficeParserConfig;
+import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFDocumentXMLBodyHandler;
+import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFTikaBodyPartHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+
+/**
+ * Thin adapter that lets {@link XWPFDocumentXMLBodyHandler} be used as a
+ * {@link PartHandler} in the inline parsing scheme.  It adds only a part
+ * name and a fixed content type on top of the superclass.
+ */
+class BodyPartHandler extends XWPFDocumentXMLBodyHandler implements PartHandler {
+
+    private String name;
+    private final String contentType;
+
+    /**
+     * @param contentType          value returned by {@link #getContentType()}
+     * @param xhtml                handler given to the wrapped {@link XWPFTikaBodyPartHandler}
+     * @param relationshipsManager not used by this constructor
+     * @param officeParserConfig   configuration given to the wrapped {@link XWPFTikaBodyPartHandler}
+     */
+    public BodyPartHandler(String contentType, XHTMLContentHandler xhtml,
+                           RelationshipsManager relationshipsManager,
+                           OfficeParserConfig officeParserConfig) {
+        //the second argument of XWPFTikaBodyPartHandler is passed as null and
+        //the superclass receives a fresh, empty map
+        super(new XWPFTikaBodyPartHandler(xhtml, null, officeParserConfig),
+                new HashMap<String, String>());
+        this.contentType = contentType;
+    }
+
+    @Override
+    public String getContentType() {
+        return this.contentType;
+    }
+
+    @Override
+    public String getName() {
+        return this.name;
+    }
+
+    @Override
+    public void setName(String name) {
+        this.name = name;
+    }
+
+    @Override
+    public void endPart() throws SAXException, TikaException {
+        //nothing to do at the end of the part
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/CorePropertiesHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/CorePropertiesHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/CorePropertiesHandler.java
new file mode 100644
index 0000000..c746e5c
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/CorePropertiesHandler.java
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006;
+
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.poi.openxml4j.opc.ContentTypes;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+/**
+ * SAX handler that copies the text of known property elements into a
+ * {@link Metadata} object.  Elements are looked up by namespace URI and local
+ * name in {@link #properties}; unmapped elements are ignored.
+ */
+class CorePropertiesHandler extends AbstractPartHandler {
+
+    //namespace keys are stored WITHOUT a trailing slash; getProperty() strips
+    //one trailing slash from incoming URIs before the lookup
+    final static String DC_NS = "http://purl.org/dc/elements/1.1";
+    final static String DC_TERMS_NS = "http://purl.org/dc/terms";
+    final static String CP_NS = "http://schemas.openxmlformats.org/package/2006/metadata/core-properties";
+
+    private final Metadata metadata;
+
+    //text of the element currently being read
+    final StringBuilder buffer = new StringBuilder();
+    //namespace URI -> (element local name -> metadata property)
+    final Map<String, Map<String, Property>> properties = new HashMap<>();
+
+    /**
+     * @param metadata object that receives the extracted property values
+     */
+    public CorePropertiesHandler(Metadata metadata) {
+        this.metadata = metadata;
+        //note: overridable; a subclass version runs here, after the field
+        //initializers of this class but before the subclass constructor body
+        addProperties();
+    }
+
+    /**
+     * Registers the element-to-property mappings for the three namespaces
+     * declared in this class.
+     */
+    void addProperties() {
+        Map<String, Property> dc = mappingsFor(DC_NS);
+        dc.put("creator", TikaCoreProperties.CREATOR);
+        dc.put("title", TikaCoreProperties.TITLE);
+        dc.put("description", TikaCoreProperties.DESCRIPTION);
+
+        Map<String, Property> dcTerms = mappingsFor(DC_TERMS_NS);
+        dcTerms.put("created", TikaCoreProperties.CREATED);
+        dcTerms.put("modified", TikaCoreProperties.MODIFIED);
+
+        Map<String, Property> cp = mappingsFor(CP_NS);
+        cp.put("category", OfficeOpenXMLCore.CATEGORY);
+        cp.put("contentStatus", OfficeOpenXMLCore.CONTENT_STATUS);
+        cp.put("lastModifiedBy", TikaCoreProperties.MODIFIER);
+        cp.put("lastPrinted", OfficeOpenXMLCore.LAST_PRINTED);
+        cp.put("revision", OfficeOpenXMLCore.REVISION);
+        cp.put("subject", OfficeOpenXMLCore.SUBJECT);
+        cp.put("version", OfficeOpenXMLCore.VERSION);
+    }
+
+    //returns the mapping table of a namespace, creating and registering it on first use
+    private Map<String, Property> mappingsFor(String namespace) {
+        Map<String, Property> mappings = properties.get(namespace);
+        if (mappings == null) {
+            mappings = new HashMap<>();
+            properties.put(namespace, mappings);
+        }
+        return mappings;
+    }
+
+    @Override
+    public void startDocument() throws SAXException {
+    }
+
+    @Override
+    public void endDocument() throws SAXException {
+        buffer.setLength(0);
+    }
+
+    @Override
+    public void startPrefixMapping(String prefix, String uri) throws SAXException {
+    }
+
+    @Override
+    public void endPrefixMapping(String prefix) throws SAXException {
+    }
+
+    /**
+     * Starts a fresh value: text received since the previous end tag (for
+     * example indentation between sibling elements) is discarded so that it
+     * cannot be prepended to this element's value.
+     */
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+        buffer.setLength(0);
+    }
+
+    /**
+     * If the closing element is mapped to a property, stores the buffered
+     * text in the metadata ({@code add} for multi-valued properties,
+     * {@code set} otherwise).  The buffer is cleared in every case.
+     */
+    @Override
+    public void endElement(String uri, String localName, String qName) throws SAXException {
+        Property prop = getProperty(uri, localName);
+        if (prop != null) {
+            String value = buffer.toString();
+            if (prop.isMultiValuePermitted()) {
+                metadata.add(prop, value);
+            } else {
+                metadata.set(prop, value);
+            }
+        }
+        buffer.setLength(0);
+    }
+
+    //looks up the property for an element, ignoring one trailing '/' on the namespace URI
+    private Property getProperty(String uri, String localName) {
+        if (uri.endsWith("/")) {
+            uri = uri.substring(0, uri.length() - 1);
+        }
+
+        Map<String, Property> m = properties.get(uri);
+        if (m != null) {
+            return m.get(localName);
+        }
+        return null;
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+        buffer.append(ch, start, length);
+    }
+
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+        buffer.append(ch, start, length);
+    }
+
+    @Override
+    public String getContentType() {
+        return ContentTypes.CORE_PROPERTIES_PART;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/ExtendedPropertiesHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/ExtendedPropertiesHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/ExtendedPropertiesHandler.java
new file mode 100644
index 0000000..74238a6
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/ExtendedPropertiesHandler.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006;
+
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.Property;
+
+/**
+ * Variant of {@link CorePropertiesHandler} whose mapping table covers the
+ * elements of the extended-properties namespace ({@link #EP_NS}).
+ */
+class ExtendedPropertiesHandler extends CorePropertiesHandler {
+
+    final static String EP_NS = "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties";
+
+    /**
+     * @param metadata object that receives the extracted property values
+     */
+    public ExtendedPropertiesHandler(Metadata metadata) {
+        super(metadata);
+    }
+
+    /**
+     * Registers the extended-properties mappings.  The superclass version is
+     * not invoked from here, so only {@link #EP_NS} elements are registered
+     * by this method.
+     */
+    @Override
+    void addProperties() {
+        Map<String, Property> extended = properties.get(EP_NS);
+        if (extended == null) {
+            extended = new HashMap<>();
+        }
+        properties.put(EP_NS, extended);
+
+        //document statistics
+        extended.put("Pages", Office.PAGE_COUNT);
+        extended.put("Words", Office.WORD_COUNT);
+        extended.put("Characters", Office.CHARACTER_COUNT);
+        extended.put("CharactersWithSpaces", Office.CHARACTER_COUNT_WITH_SPACES);
+        extended.put("Paragraphs", Office.PARAGRAPH_COUNT);
+        extended.put("Lines", Office.LINE_COUNT);
+
+        //remaining extended properties
+        extended.put("Application", OfficeOpenXMLExtended.APPLICATION);
+        extended.put("AppVersion", OfficeOpenXMLExtended.APP_VERSION);
+        extended.put("Comments", OfficeOpenXMLExtended.COMMENTS);
+        extended.put("Company", OfficeOpenXMLExtended.COMPANY);
+        extended.put("DocSecurity", OfficeOpenXMLExtended.DOC_SECURITY);
+        extended.put("HiddenSlides", OfficeOpenXMLExtended.HIDDEN_SLIDES);
+        extended.put("Manager", OfficeOpenXMLExtended.MANAGER);
+        extended.put("Notes", OfficeOpenXMLExtended.NOTES);
+        extended.put("PresentationFormat", OfficeOpenXMLExtended.PRESENTATION_FORMAT);
+        extended.put("Template", OfficeOpenXMLExtended.TEMPLATE);
+        extended.put("TotalTime", OfficeOpenXMLExtended.TOTAL_TIME);
+    }
+
+    @Override
+    public String getContentType() {
+        return "application/vnd.openxmlformats-officedocument.extended-properties+xml";
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/PartHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/PartHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/PartHandler.java
new file mode 100644
index 0000000..fee64de
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/PartHandler.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006;
+
+import org.apache.tika.exception.TikaException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * A SAX {@link ContentHandler} that additionally carries a name and a
+ * content type and is told when its part has ended.
+ */
+interface PartHandler extends ContentHandler {
+
+    /**
+     * @return the content type string for this handler
+     */
+    String getContentType();
+
+    /**
+     * @return the name of this handler
+     */
+    String getName();
+
+    /**
+     * @param name the name to assign to this handler
+     */
+    void setName(String name);
+
+    /**
+     * Called at the end of a part; implement this to flush buffers, etc.,
+     * where that is needed.
+     */
+    void endPart() throws SAXException, TikaException;
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Relationship.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Relationship.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Relationship.java
new file mode 100644
index 0000000..eccb1bf
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Relationship.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006;
+
+
+import org.apache.poi.openxml4j.opc.TargetMode;
+
+/**
+ * Immutable holder for one relationship: its content type, its target and
+ * its {@link TargetMode}.
+ */
+class Relationship {
+
+    private final String contentType;
+    private final String target;
+    //null when the two-argument constructor was used
+    private final TargetMode targetMode;
+
+    /**
+     * @param contentType content type of the relationship
+     * @param target      target of the relationship
+     * @param targetMode  target mode of the relationship
+     */
+    public Relationship(String contentType, String target, TargetMode targetMode) {
+        this.contentType = contentType;
+        this.target = target;
+        this.targetMode = targetMode;
+    }
+
+    /**
+     * Same as the three-argument constructor with a {@code null} target mode.
+     */
+    public Relationship(String contentType, String target) {
+        this(contentType, target, null);
+    }
+
+    /**
+     * @return the target mode; may be {@code null}
+     */
+    public TargetMode getTargetMode() {
+        return this.targetMode;
+    }
+
+    /**
+     * @return the target given at construction
+     */
+    public String getTarget() {
+        return this.target;
+    }
+
+    /**
+     * @return the content type given at construction
+     */
+    public String getContentType() {
+        return this.contentType;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/RelationshipsHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/RelationshipsHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/RelationshipsHandler.java
new file mode 100644
index 0000000..670ffab
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/RelationshipsHandler.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006;
+
+
+import org.apache.poi.openxml4j.opc.ContentTypes;
+import org.apache.poi.openxml4j.opc.TargetMode;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+/**
+ * SAX handler for a relationships part: every {@code Relationship} element
+ * in the relationships namespace ({@link #REL_NS}) is registered with the
+ * {@link RelationshipsManager} under this handler's name.
+ */
+class RelationshipsHandler extends AbstractPartHandler {
+
+    final static String REL_NS = "http://schemas.openxmlformats.org/package/2006/relationships";
+
+    private final RelationshipsManager relationshipsManager;
+
+    /**
+     * @param relationshipsManager registry that receives the relationships read by this handler
+     */
+    public RelationshipsHandler(RelationshipsManager relationshipsManager) {
+        this.relationshipsManager = relationshipsManager;
+    }
+
+
+    @Override
+    public void startDocument() throws SAXException {
+    }
+
+    @Override
+    public void endDocument() throws SAXException {
+    }
+
+    @Override
+    public void startPrefixMapping(String prefix, String uri) throws SAXException {
+    }
+
+    @Override
+    public void endPrefixMapping(String prefix) throws SAXException {
+    }
+
+    /**
+     * Reads the {@code Id}, {@code Type}, {@code Target} and
+     * {@code TargetMode} attributes of a {@code Relationship} element and
+     * registers them.  A missing or unrecognised {@code TargetMode} is
+     * treated as {@link TargetMode#INTERNAL}.
+     */
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+        if (!REL_NS.equals(uri) || !"Relationship".equals(localName)) {
+            return;
+        }
+        String id = atts.getValue("", "Id");
+        String type = atts.getValue("", "Type");
+        String target = atts.getValue("", "Target");
+        String targetModeString = atts.getValue("", "TargetMode");
+        //the attribute value is written "External" in OPC packages, so the
+        //comparison must not depend on case
+        TargetMode targetMode = "External".equalsIgnoreCase(targetModeString)
+                ? TargetMode.EXTERNAL : TargetMode.INTERNAL;
+        relationshipsManager.addRelationship(getName(), id, type, target, targetMode);
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String qName) throws SAXException {
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+    }
+
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+
+    }
+
+    @Override
+    public String getContentType() {
+        return ContentTypes.RELATIONSHIPS_PART;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/RelationshipsManager.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/RelationshipsManager.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/RelationshipsManager.java
new file mode 100644
index 0000000..5773fbb
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/RelationshipsManager.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006;
+
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.poi.openxml4j.opc.TargetMode;
+
+/**
+ * Registry of relationships, grouped by the name of the package part they
+ * belong to and then keyed by relationship id.
+ */
+class RelationshipsManager {
+
+    //package part name -> (relationship id -> relationship)
+    Map<String, Map<String, Relationship>> map = new HashMap<>();
+
+    /**
+     * Stores a relationship read from a .rels file.  A later relationship
+     * with the same part and id replaces the earlier one.
+     *
+     * @param relsFileName name of the .rels file the relationship came from
+     * @param id           relationship id
+     * @param type         relationship type, stored as the content type of the {@link Relationship}
+     * @param target       relationship target
+     * @param targetMode   relationship target mode
+     */
+    public void addRelationship(String relsFileName, String id, String type, String target, TargetMode targetMode) {
+        String packageName = convertRelsFileNameToPackageName(relsFileName);
+        Map<String, Relationship> thisPackageRels = map.get(packageName);
+        if (thisPackageRels == null) {
+            thisPackageRels = new HashMap<>();
+            map.put(packageName, thisPackageRels);
+        }
+        thisPackageRels.put(id, new Relationship(type, target, targetMode));
+    }
+
+    /**
+     * @param packageName name of the package part
+     * @param id          relationship id
+     * @return the matching relationship, or {@code null} if the part or the id is unknown
+     */
+    public Relationship getRelationship(String packageName, String id) {
+        Map<String, Relationship> thisPackageRels = map.get(packageName);
+        if (thisPackageRels != null) {
+            return thisPackageRels.get(id);
+        }
+        return null;
+    }
+
+    /**
+     * Maps a .rels file name to the part it describes: "/_rels/.rels"
+     * becomes "/", otherwise the first "/_rels/" segment collapses to "/"
+     * and a trailing ".rels" is removed, e.g.
+     * "/word/_rels/document.xml.rels" becomes "/word/document.xml".
+     */
+    private String convertRelsFileNameToPackageName(String relsFileName) {
+        if ("/_rels/.rels".equals(relsFileName)) {
+            return "/";
+        }
+
+        String tmp = relsFileName;
+        tmp = tmp.replaceFirst("/_rels/", "/");
+        //the dot is escaped so that only a literal ".rels" suffix is removed
+        tmp = tmp.replaceFirst("\\.rels\\Z", "");
+        return tmp;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLDocHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLDocHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLDocHandler.java
new file mode 100644
index 0000000..4276671
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLDocHandler.java
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006;
+
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.poi.xwpf.usermodel.XWPFRelation;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.OfficeParserConfig;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+class Word2006MLDocHandler extends DefaultHandler {
+
+    final static String PKG_NS = "http://schemas.microsoft.com/office/2006/xmlPackage";
+
+
+    private final XHTMLContentHandler xhtml;
+    private final Metadata metadata;
+    private final ParseContext parseContext;
+
+    private final Map<String, PartHandler> partHandlers = new HashMap<>();
+    private final BinaryDataHandler binaryDataHandler;
+    private final RelationshipsManager relationshipsManager = new RelationshipsManager();
+    private PartHandler currentPartHandler = null;
+
+    public Word2006MLDocHandler(XHTMLContentHandler xhtml, Metadata metadata,
+                                ParseContext context) {
+        this.xhtml = xhtml;
+        this.metadata = metadata;
+        this.parseContext = context;
+        OfficeParserConfig officeParserConfig = context.get(OfficeParserConfig.class);
+
+        addPartHandler(new RelationshipsHandler(relationshipsManager));
+
+        addPartHandler(new BodyPartHandler(
+                XWPFRelation.DOCUMENT.getContentType(),
+                xhtml, relationshipsManager, officeParserConfig));
+
+        addPartHandler(new BodyPartHandler(
+                XWPFRelation.FOOTNOTE.getContentType(),
+                xhtml, relationshipsManager, officeParserConfig));
+
+        addPartHandler(new BodyPartHandler(
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml",
+                xhtml, relationshipsManager, officeParserConfig));
+
+        addPartHandler(new BodyPartHandler(
+                XWPFRelation.HEADER.getContentType(),
+                xhtml, relationshipsManager, officeParserConfig));
+
+        addPartHandler(new BodyPartHandler(
+                XWPFRelation.FOOTER.getContentType(),
+                xhtml, relationshipsManager, officeParserConfig));
+
+        addPartHandler(new BodyPartHandler(
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml",
+                xhtml, relationshipsManager, officeParserConfig));
+
+
+        addPartHandler(new BodyPartHandler(
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml",
+                xhtml, relationshipsManager, officeParserConfig));
+
+        addPartHandler(new BodyPartHandler(
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.document.glossary+xml",
+                xhtml, relationshipsManager, officeParserConfig));
+
+        addPartHandler(new CorePropertiesHandler(metadata));
+        addPartHandler(new ExtendedPropertiesHandler(metadata));
+        binaryDataHandler = new BinaryDataHandler(xhtml, metadata, context);
+    }
+
+    private void addPartHandler(PartHandler partHandler) {
+        partHandlers.put(partHandler.getContentType(), partHandler);
+    }
+
+
+    @Override
+    public void startDocument() throws SAXException {
+    }
+
+    @Override
+    public void endDocument() throws SAXException {
+    }
+
+    @Override
+    public void startPrefixMapping(String prefix, String uri) throws SAXException {
+    }
+
+    @Override
+    public void endPrefixMapping(String prefix) throws SAXException {
+
+    }
+
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+        if (uri.equals(PKG_NS) && localName.equals("part")) {
+            //start of a package
+            String name = atts.getValue(PKG_NS, "name");
+            String contentType = atts.getValue(PKG_NS, "contentType");
+            currentPartHandler = partHandlers.get(contentType);
+            //for now treat every unknown part type
+            //as if it contained binary data
+            if (currentPartHandler == null) {
+                currentPartHandler = binaryDataHandler;
+            }
+            if (currentPartHandler != null) {
+                currentPartHandler.setName(name);
+            }
+        } else if (currentPartHandler != null) {
+            currentPartHandler.startElement(uri, localName, qName, atts);
+        }
+
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String qName) throws SAXException {
+        if (uri.equals(PKG_NS) && localName.equals("part")) {
+            //do post processing
+            if (currentPartHandler != null) {
+                try {
+                    currentPartHandler.endPart();
+                } catch (TikaException e) {
+                    throw new SAXException(e);
+                }
+            }
+            //then reset
+            currentPartHandler = null;
+        } else if (currentPartHandler != null) {
+            currentPartHandler.endElement(uri, localName, qName);
+        }
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+        if (currentPartHandler != null) {
+            currentPartHandler.characters(ch, start, length);
+        }
+    }
+
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+        if (currentPartHandler != null) {
+            currentPartHandler.characters(ch, start, length);
+        }
+
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParser.java
new file mode 100644
index 0000000..ff8a43d
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParser.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.AbstractOfficeParser;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+
+public class Word2006MLParser extends AbstractOfficeParser {
+
+    protected static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(
+                    MediaType.application("vnd.ms-word2006ml"));
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+                      Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
+        //set OfficeParserConfig if the user hasn't specified one
+        configure(context);
+
+        final XHTMLContentHandler xhtml =
+                new XHTMLContentHandler(handler, metadata);
+
+        xhtml.startDocument();
+
+        try {
+            context.getSAXParser().parse(
+                    new CloseShieldInputStream(stream),
+                    new OfflineContentHandler(new EmbeddedContentHandler(
+                            new Word2006MLDocHandler(xhtml, metadata, context))));
+        } catch (SAXException e) {
+            throw new TikaException("XML parse error", e);
+        } finally {
+            xhtml.endDocument();
+        }
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parser-modules/tika-parser-office-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index f492e89..7aa2b01 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parser-modules/tika-parser-office-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -21,7 +21,7 @@ org.apache.tika.parser.microsoft.OldExcelParser
 org.apache.tika.parser.microsoft.TNEFParser
 org.apache.tika.parser.microsoft.MSOwnerFileParser
 org.apache.tika.parser.microsoft.ooxml.OOXMLParser
-org.apache.tika.parser.microsoft.ooxml.xwpf.Word2006MLParser
+org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006.Word2006MLParser
 org.apache.tika.parser.microsoft.xml.WordMLParser
 org.apache.tika.parser.microsoft.xml.SpreadsheetMLParser
 #org.apache.tika.parser.odf.OpenDocumentContentParser

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index d924f41..ea936d8 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -25,10 +25,12 @@ import javax.xml.transform.sax.SAXTransformerFactory;
 import javax.xml.transform.sax.TransformerHandler;
 import javax.xml.transform.stream.StreamResult;
 import java.io.ByteArrayOutputStream;
+import java.io.File;
 import java.io.InputStream;
 import java.io.PrintStream;
 import java.io.StringWriter;
 import java.util.Arrays;
+import java.util.Date;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
@@ -44,10 +46,12 @@ import org.apache.tika.metadata.OfficeOpenXMLExtended;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.metadata.TikaMetadataKeys;
 import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.PasswordProvider;
 import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.parser.microsoft.OfficeParserConfig;
 import org.apache.tika.parser.microsoft.WordParserTest;
 import org.apache.tika.sax.BodyContentHandler;
 import org.junit.Ignore;
@@ -1299,6 +1303,33 @@ public class OOXMLParserTest extends TikaTest {
         assertContainsAtLeast(minExpected, getRecursiveMetadata("testEXCEL_macro.xlsm"));
     }
 
+    //@Test //use this for lightweight benchmarking to compare xwpf options
+    /**
+     * Parses every .docx file in /test-documents 100 times with the SAX docx
+     * extractor enabled, then prints the elapsed milliseconds and the number
+     * of parses that threw an exception.
+     */
+    public void testBatch() throws Exception {
+        OfficeParserConfig saxConfig = new OfficeParserConfig();
+        saxConfig.setUseSAXDocxExtractor(true);
+        long startMillis = System.currentTimeMillis();
+        int failures = 0;
+        for (int round = 0; round < 100; round++) {
+            for (File candidate : getResourceAsFile("/test-documents").listFiles()) {
+                if (candidate.getName().endsWith(".docx")) {
+                    try (InputStream is = TikaInputStream.get(candidate)) {
+                        ParseContext parseContext = new ParseContext();
+                        parseContext.set(OfficeParserConfig.class, saxConfig);
+                        //test only the extraction of the main docx content, not embedded docs
+                        parseContext.set(Parser.class, new EmptyParser());
+                        Metadata metadata = new Metadata();
+                        getXML(is, parser, metadata, parseContext);
+                    } catch (Exception e) {
+                        //failures are only counted; the benchmark keeps going
+                        failures++;
+                    }
+                }
+            }
+        }
+        long elapsedMillis = System.currentTimeMillis() - startMillis;
+        System.out.println("elapsed: " + elapsedMillis + " with " + failures + " exceptions");
+    }
+
 }
 
 

http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParserTest.java
deleted file mode 100644
index 607e6ef..0000000
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParserTest.java
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-import static org.junit.Assert.assertEquals;
-
-import java.util.List;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
-import org.apache.tika.metadata.OfficeOpenXMLCore;
-import org.apache.tika.metadata.OfficeOpenXMLExtended;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.RecursiveParserWrapper;
-import org.apache.tika.parser.microsoft.MSOfficeParserConfig;
-import org.junit.Test;
-
-
-/**
- * Tests the extraction of metadata and content from testWORD_2006ml.xml.
- */
-public class Word2006MLParserTest extends TikaTest {
-
-    @Test
-    public void basicTest() throws Exception {
-
-
-
-        List<Metadata> metadataList = getRecursiveMetadata("testWORD_2006ml.xml");
-
-        assertEquals(5, metadataList.size());
-
-        Metadata m = metadataList.get(0);
-
-        assertEquals("2016-11-23T12:07:00Z", m.get(TikaCoreProperties.CREATED));
-        assertEquals("2016-11-23T12:07:00Z", m.get(TikaCoreProperties.MODIFIED));
-        assertEquals("My Document Title", m.get(TikaCoreProperties.TITLE));
-        assertEquals("This is the Author", m.get(TikaCoreProperties.CREATOR));
-        assertEquals("2", m.get(OfficeOpenXMLCore.REVISION));
-        assertEquals("Allison, Timothy B.", m.get(OfficeOpenXMLCore.LAST_MODIFIED_BY));
-        assertEquals("0", m.get(OfficeOpenXMLExtended.DOC_SECURITY));
-        assertEquals("225", m.get(Office.WORD_COUNT));
-        assertEquals("3", m.get(Office.PARAGRAPH_COUNT));
-        assertEquals("1506", m.get(Office.CHARACTER_COUNT_WITH_SPACES));
-        assertEquals("10", m.get(Office.LINE_COUNT));
-        assertEquals("16.0000", m.get(OfficeOpenXMLExtended.APP_VERSION));
-
-
-        String content = m.get(RecursiveParserWrapper.TIKA_CONTENT);
-
-
-        assertContainsCountTimes("engaging title page", content, 1);
-        assertContainsCountTimes("<p>This is the Author</p>", content, 1);
-        assertContainsCountTimes("<p>This is an engaging title page</p>", content, 1);
-
-        assertContains("<p>My Document Title</p>", content);
-        assertContains("<p>My Document Subtitle</p>", content);
-
-        assertContains("<p>\tHeading1\t3</p>", content);
-
-
-        //TODO: integrate numbering
-        assertContains("Really basic 2.", content);
-
-        assertContainsCountTimes("This is a text box", content, 1);
-
-        assertContains("<p>This is a hyperlink: <a href=\"http://tika.apache.org\">tika</a></p>", content);
-
-        assertContains("<p>This is a link to a local file: <a href=\"file:///C:\\data\\test.png\">test.png</a></p>", content);
-
-        assertContains("<p>This is          10 spaces</p>", content);
-
-        //caption
-        assertContains("<p>Table 1: Table1 Caption</p>", content);
-
-        //embedded table
-        //TODO: figure out how to handle embedded tables in html
-        assertContains("<p>Embedded table r1c1</p>", content);
-
-        //shape
-        assertContainsCountTimes("<p>This is text within a shape", content, 1);
-
-        //sdt rich text
-        assertContains("<p>Rich text content control", content);
-
-        //sdt simple text
-        assertContains("<p>Simple text content control", content);
-
-        //sdt repeating
-        assertContains("Repeating content", content);
-
-        //sdt dropdown
-        //TODO: get options for dropdown
-        assertContains("Drop down1", content);
-
-        //sdt date
-        assertContains("<p>11/16/2016</p>", content);
-
-        //test that <tab/> works
-        assertContains("tab\ttab", content);
-
-        assertContainsCountTimes("serious word art", content, 1);
-        assertContainsCountTimes("Wordartr1c1", content, 1);
-
-        //glossary document contents
-        assertContains("Click or tap to enter a date", content);
-
-        //basic formatting
-        assertContains("<p>The <i>quick</i> brown <b>fox </b>j<i>um</i><b><i>ped</i></b> over",
-                content);
-
-        //TODO: add chart parsing
-//        assertContains("This is the chart", content);
-
-        assertContains("This is a comment", content);
-
-        assertContains("This is an endnote", content);
-
-        assertContains("this is the footnote", content);
-
-        assertContains("First page header", content);
-
-        assertContains("Even page header", content);
-
-        assertContains("Odd page header", content);
-
-        assertContains("First page footer", content);
-
-        assertContains("Even page footer", content);
-
-        assertContains("Odd page footer", content);
-
-        //test default includes deleted
-        assertContains("frog", content);
-
-        assertContains("Mattmann", content);
-
-        //TODO: extract this...Note that it is in "Backup" not "Choice"!!!
-//        assertContains("This is the chart title", content);
-
-
-
-    }
-
-    /**
-     * Asserts that needle occurs in haystack exactly expectedCount times.
-     * Overlapping occurrences are counted, because the search resumes one
-     * character after the start of the previous hit.
-     *
-     * @param needle        text to search for; expected to be non-empty
-     * @param haystack      text to search in
-     * @param expectedCount required number of occurrences
-     */
-    private void assertContainsCountTimes(String needle, String haystack, int expectedCount) {
-        //search for the needle that was passed in, not a fixed string
-        int i = haystack.indexOf(needle);
-        int cnt = 0;
-        while (i > -1) {
-            cnt++;
-            i = haystack.indexOf(needle, i+1);
-        }
-        assertEquals("found needle >"+ needle+"< "+cnt+" times instead of expected: "+expectedCount,
-                expectedCount, cnt);
-
-    }
-
-    /**
-     * With deleted content excluded in the parser config, the word "frog"
-     * must not appear in the output.
-     */
-    @Test
-    public void testSkipDeleted() throws Exception {
-        ParseContext pc = new ParseContext();
-        MSOfficeParserConfig msOfficeParserConfig = new MSOfficeParserConfig();
-        msOfficeParserConfig.setIncludeDeletedContent(false);
-        pc.set(MSOfficeParserConfig.class, msOfficeParserConfig);
-
-        XMLResult r = getXML("testWORD_2006ml.xml", pc);
-        assertNotContained("frog", r.xml);
-    }
-
-}