You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/06 04:50:57 UTC
svn commit: r1723223 [11/32] - in /tika/branches/2.x: tika-core/src/test/resources/META-INF/ tika-core/src/test/resources/META-INF/services/ tika-parser-modules/ tika-parser-modules/tika-advanced-module/ tika-parser-modules/tika-advanced-module/src/ ti...

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,711 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFOldDocument;
+import org.apache.poi.hwpf.OldWordFileFormatException;
+import org.apache.poi.hwpf.extractor.Word6Extractor;
+import org.apache.poi.hwpf.model.FieldsDocumentPart;
+import org.apache.poi.hwpf.model.PicturesTable;
+import org.apache.poi.hwpf.model.StyleDescription;
+import org.apache.poi.hwpf.usermodel.CharacterRun;
+import org.apache.poi.hwpf.usermodel.Field;
+import org.apache.poi.hwpf.usermodel.HeaderStories;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+import org.apache.poi.hwpf.usermodel.Picture;
+import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.hwpf.usermodel.Table;
+import org.apache.poi.hwpf.usermodel.TableCell;
+import org.apache.poi.hwpf.usermodel.TableRow;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+public class WordExtractor extends AbstractPOIFSExtractor {
+
+    // Replacement written for Word's non-breaking hyphen (char 30) in handleCharacterRun
+    private static final char UNICODECHAR_NONBREAKING_HYPHEN = '\u2011';
+    // Replacement written for Word's non-required hyphen (char 31) in handleCharacterRun
+    private static final char UNICODECHAR_ZERO_WIDTH_SPACE = '\u200b';
+    // could be improved by using the real delimiter in xchFollow [MS-DOC], v20140721, 2.4.6.3, Part 3, Step 3
+    // NOTE(review): no code visible in this class reads LIST_DELIMITER - confirm it is still needed
+    private static final String LIST_DELIMITER = " ";
+    // Style names with a fixed tag/class mapping; buildParagraphTagAndStyle consults this first
+    private static final Map<String, TagAndStyle> fixedParagraphStyles = new HashMap<String, TagAndStyle>();
+    // A plain <p> without a class attribute
+    private static final TagAndStyle defaultParagraphStyle = new TagAndStyle("p", null);
+
+    // Fill the fixed style-name lookup once, when the class is initialised
+    static {
+        fixedParagraphStyles.put("Default", defaultParagraphStyle);
+        fixedParagraphStyles.put("Normal", defaultParagraphStyle);
+        fixedParagraphStyles.put("heading", new TagAndStyle("h1", null));
+        fixedParagraphStyles.put("Heading", new TagAndStyle("h1", null));
+        fixedParagraphStyles.put("Title", new TagAndStyle("h1", "title"));
+        fixedParagraphStyles.put("Subtitle", new TagAndStyle("h2", "subtitle"));
+        fixedParagraphStyles.put("HTML Preformatted", new TagAndStyle("pre", null));
+    }
+
+    // True if we are currently in the named style tag:
+    // (handleCharacterRun flips these as it opens and closes <s>, <b> and <i>;
+    //  handleParagraph resets them when it closes whatever is still open)
+    private boolean curStrikeThrough;
+    private boolean curBold;
+    private boolean curItalic;
+
+    /**
+     * Creates an extractor for the given parse context.
+     *
+     * @param context parse context, passed unchanged to the superclass constructor
+     */
+    public WordExtractor(ParseContext context) {
+        super(context);
+    }
+
+    /**
+     * Adds up the paragraph counts of all supplied ranges.
+     * Null entries are ignored.
+     *
+     * @param ranges the ranges to inspect; individual entries may be null
+     * @return the total number of paragraphs across the non-null ranges
+     */
+    private static int countParagraphs(Range... ranges) {
+        int total = 0;
+        for (int idx = 0; idx < ranges.length; idx++) {
+            Range range = ranges[idx];
+            if (range == null) {
+                continue;
+            }
+            total += range.numParagraphs();
+        }
+        return total;
+    }
+
+    /**
+     * Given a style name, return what tag should be used, and
+     * what style should be applied to it.
+     * <p>
+     * Lookup order: the fixed style table first, then "Table Contents" inside
+     * a table (plain paragraph), then heading styles (mapped to h1 - h6), and
+     * finally any other name, which becomes a &lt;p&gt; whose class is the style
+     * name with spaces turned into underscores and the first letter lower-cased.
+     *
+     * @param styleName the name of the paragraph style; a null or empty name
+     *                  yields the default paragraph style
+     * @param isTable   true if the paragraph sits inside a table
+     * @return the tag and (possibly null) style class to use, never null
+     */
+    public static TagAndStyle buildParagraphTagAndStyle(String styleName, boolean isTable) {
+        // Nothing to derive a tag or class from: fall back to a plain paragraph
+        // rather than failing on substring(0, 1) / a null dereference below
+        if (styleName == null || styleName.isEmpty()) {
+            return defaultParagraphStyle;
+        }
+
+        TagAndStyle tagAndStyle = fixedParagraphStyles.get(styleName);
+        if (tagAndStyle != null) {
+            return tagAndStyle;
+        }
+
+        if (styleName.equals("Table Contents") && isTable) {
+            return defaultParagraphStyle;
+        }
+
+        String tag = "p";
+        String styleClass = null;
+
+        if (styleName.startsWith("heading") || styleName.startsWith("Heading")) {
+            // "Heading 3" or "Heading2" or "heading 4"
+            // Read the whole trailing number, not just the last character,
+            //  so that "Heading 10" is level 10 rather than level 0
+            int end = styleName.length();
+            int begin = end;
+            while (begin > 0 && Character.isDigit(styleName.charAt(begin - 1))) {
+                begin--;
+            }
+
+            int num = 1;
+            if (begin < end) {
+                try {
+                    num = Integer.parseInt(styleName.substring(begin, end));
+                } catch (NumberFormatException ignored) {
+                    // Only reachable when the number overflows an int:
+                    //  treat it as "deeper than anything we can express"
+                    num = 6;
+                }
+            }
+            // Turn it into a H1 - H6 (H0 and H7+ aren't valid!)
+            tag = "h" + Math.max(1, Math.min(num, 6));
+        } else {
+            styleClass = styleName.replace(' ', '_');
+            styleClass = styleClass.substring(0, 1).toLowerCase(Locale.ROOT) +
+                    styleClass.substring(1);
+        }
+
+        return new TagAndStyle(tag, styleClass);
+    }
+
+    /**
+     * Parses the Word document held in the given filesystem by delegating to
+     * {@link #parse(DirectoryNode, XHTMLContentHandler)} with its root directory.
+     *
+     * @param filesystem the filesystem whose root contains the document
+     * @param xhtml      handler that receives the extracted content
+     */
+    protected void parse(
+            NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+        parse(filesystem.getRoot(), xhtml);
+    }
+
+    /**
+     * Extracts the Word document rooted at the given directory.
+     * <p>
+     * Output order: headers, main body paragraphs, textbox / footnote /
+     * comment / endnote text, footers, any pictures not yet emitted, and
+     * finally the embedded documents found under "ObjectPool". If the file is
+     * rejected as an old Word format, the work is handed to
+     * {@link #parseWord6(DirectoryNode, XHTMLContentHandler)} instead.
+     *
+     * @param root  directory holding the document streams
+     * @param xhtml handler that receives the extracted content
+     */
+    protected void parse(
+            DirectoryNode root, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+        HWPFDocument doc;
+        try {
+            doc = new HWPFDocument(root);
+        } catch (OldWordFileFormatException e) {
+            // HWPFDocument refused the file as an old format: use the Word 6 path
+            parseWord6(root, xhtml);
+            return;
+        }
+        org.apache.poi.hwpf.extractor.WordExtractor poiExtractor =
+                new org.apache.poi.hwpf.extractor.WordExtractor(doc);
+        HeaderStories stories = new HeaderStories(doc);
+
+        // Pictures are expected to come in document order, and may be either
+        //  placed directly or referenced from an anchor
+        PicturesTable pictureTable = doc.getPicturesTable();
+        PicturesSource pictures = new PicturesSource(doc);
+
+        // Headers first, if there are any
+        Range[] headers = {
+                stories.getFirstHeaderSubrange(),
+                stories.getEvenHeaderSubrange(),
+                stories.getOddHeaderSubrange()
+        };
+        handleHeaderFooter(headers, "header", doc, pictures, pictureTable, xhtml);
+
+        // Main body; handleParagraph reports how many extra paragraphs
+        //  (the rest of a table) it consumed so they can be skipped here
+        Range body = doc.getRange();
+        ListManager lists = new ListManager(doc);
+        int idx = 0;
+        while (idx < body.numParagraphs()) {
+            Paragraph para = body.getParagraph(idx);
+            idx += 1 + handleParagraph(para, 0, body, doc, FieldsDocumentPart.MAIN, pictures, pictureTable, lists, xhtml);
+        }
+
+        // The remaining text stories, each entry as its own paragraph
+        for (String textbox : poiExtractor.getMainTextboxText()) {
+            xhtml.element("p", textbox);
+        }
+
+        for (String footnote : poiExtractor.getFootnoteText()) {
+            xhtml.element("p", footnote);
+        }
+
+        for (String comment : poiExtractor.getCommentsText()) {
+            xhtml.element("p", comment);
+        }
+
+        for (String endnote : poiExtractor.getEndnoteText()) {
+            xhtml.element("p", endnote);
+        }
+
+        // Footers, if there are any
+        Range[] footers = {
+                stories.getFirstFooterSubrange(),
+                stories.getEvenFooterSubrange(),
+                stories.getOddFooterSubrange()
+        };
+        handleHeaderFooter(footers, "footer", doc, pictures, pictureTable, xhtml);
+
+        // Flush every picture that no character run has claimed so far
+        Picture unclaimed = pictures.nextUnclaimed();
+        while (unclaimed != null) {
+            handlePictureCharacterRun(null, unclaimed, pictures, xhtml);
+            unclaimed = pictures.nextUnclaimed();
+        }
+
+        // Handle any embedded office documents
+        try {
+            DirectoryEntry objectPool = (DirectoryEntry) root.getEntry("ObjectPool");
+            for (Entry child : objectPool) {
+                boolean embeddedDir = child.getName().startsWith("_")
+                        && child instanceof DirectoryEntry;
+                if (embeddedDir) {
+                    handleEmbeddedOfficeDoc((DirectoryEntry) child, xhtml);
+                }
+            }
+        } catch (FileNotFoundException e) {
+            // No "ObjectPool" entry: nothing embedded to report
+        }
+    }
+
+    /**
+     * Writes the given header or footer ranges inside one
+     * &lt;div class="<i>type</i>"&gt; element. Nothing is written when the
+     * ranges contain no paragraphs at all.
+     *
+     * @param ranges       the header or footer ranges; entries may be null
+     * @param type         value of the div's class attribute
+     * @param document     the owning document
+     * @param pictures     picture bookkeeping shared across the parse
+     * @param pictureTable POI picture table
+     * @param xhtml        handler that receives the output
+     */
+    private void handleHeaderFooter(Range[] ranges, String type, HWPFDocument document,
+                                    PicturesSource pictures, PicturesTable pictureTable, XHTMLContentHandler xhtml)
+            throws SAXException, IOException, TikaException {
+        if (countParagraphs(ranges) <= 0) {
+            return;
+        }
+
+        xhtml.startElement("div", "class", type);
+        // A single list manager serves all ranges of this section
+        ListManager lists = new ListManager(document);
+        for (Range range : ranges) {
+            if (range == null) {
+                continue;
+            }
+            int idx = 0;
+            while (idx < range.numParagraphs()) {
+                Paragraph para = range.getParagraph(idx);
+                // Skip past any paragraphs handleParagraph consumed on top of this one
+                idx += 1 + handleParagraph(para, 0, range, document,
+                        FieldsDocumentPart.HEADER, pictures, pictureTable, lists, xhtml);
+            }
+        }
+        xhtml.endElement("div");
+    }
+
+    /**
+     * Writes a single paragraph to the output or, when the paragraph belongs
+     * to a table and we are not already inside one, the complete table.
+     *
+     * @param p                the paragraph to emit
+     * @param parentTableLevel table level of the caller; 0 when not inside a table
+     * @param r                the range {@code p} was taken from
+     * @param document         the owning document (style sheet and field lookups)
+     * @param docPart          the document part in which fields are looked up
+     * @param pictures         picture bookkeeping shared across the parse
+     * @param pictureTable     POI picture table, used to detect inline pictures
+     * @param listManager      supplies the formatted number of list paragraphs
+     * @param xhtml            handler that receives the output
+     * @return the number of paragraphs after {@code p} that were consumed as
+     *         well: {@code t.numParagraphs() - 1} for a table, otherwise 0
+     */
+    private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocument document,
+                                FieldsDocumentPart docPart, PicturesSource pictures, PicturesTable pictureTable, ListManager listManager,
+                                XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException {
+        // Note - a poi bug means we can't currently properly recurse
+        //  into nested tables, so currently we don't
+        if (p.isInTable() && p.getTableLevel() > parentTableLevel && parentTableLevel == 0) {
+            Table t = r.getTable(p);
+            xhtml.startElement("table");
+            xhtml.startElement("tbody");
+            for (int rn = 0; rn < t.numRows(); rn++) {
+                TableRow row = t.getRow(rn);
+                xhtml.startElement("tr");
+                for (int cn = 0; cn < row.numCells(); cn++) {
+                    TableCell cell = row.getCell(cn);
+                    xhtml.startElement("td");
+
+                    for (int pn = 0; pn < cell.numParagraphs(); pn++) {
+                        Paragraph cellP = cell.getParagraph(pn);
+                        handleParagraph(cellP, p.getTableLevel(), cell, document, docPart, pictures, pictureTable, listManager, xhtml);
+                    }
+                    xhtml.endElement("td");
+                }
+                xhtml.endElement("tr");
+            }
+            xhtml.endElement("tbody");
+            xhtml.endElement("table");
+            // The caller already counts p itself, so report only the rest of the table
+            return (t.numParagraphs() - 1);
+        }
+
+        String text = p.text();
+        if (text.replaceAll("[\\r\\n\\s]+", "").isEmpty()) {
+            // Skip empty paragraphs
+            return 0;
+        }
+
+        TagAndStyle tas;
+        String numbering = null;
+
+        if (document.getStyleSheet().numStyles() > p.getStyleIndex()) {
+            StyleDescription style =
+                    document.getStyleSheet().getStyleDescription(p.getStyleIndex());
+            if (style != null && style.getName() != null && style.getName().length() > 0) {
+                if (p.isInList()) {
+                    numbering = listManager.getFormattedNumber(p);
+                }
+                tas = buildParagraphTagAndStyle(style.getName(), (parentTableLevel > 0));
+            } else {
+                tas = new TagAndStyle("p", null);
+            }
+        } else {
+            // Style index points outside the style sheet: plain paragraph
+            tas = new TagAndStyle("p", null);
+        }
+
+        if (tas.getStyleClass() != null) {
+            xhtml.startElement(tas.getTag(), "class", tas.getStyleClass());
+        } else {
+            xhtml.startElement(tas.getTag());
+        }
+
+        if (numbering != null) {
+            xhtml.characters(numbering);
+        }
+
+        for (int j = 0; j < p.numCharacterRuns(); j++) {
+            CharacterRun cr = p.getCharacterRun(j);
+            String runText = cr.text();
+
+            // FIELD_BEGIN_MARK (0x13) as the first character.
+            // The emptiness check keeps a zero-length run from blowing up here.
+            if (!runText.isEmpty() && runText.charAt(0) == 0x13) {
+                Field field = document.getFields().getFieldByStartOffset(docPart, cr.getStartOffset());
+                // 58 is an embedded document
+                // 56 is a document link
+                if (field != null && (field.getType() == 58 || field.getType() == 56)) {
+                    // Embedded Object: add a <div
+                    // class="embedded" id="_X"/> so consumer can see where
+                    // in the main text each embedded document
+                    // occurred:
+                    String id = "_unknown_id";
+                    // A field without a separator mark has no separator run;
+                    // keep the placeholder id instead of failing the whole parse
+                    CharacterRun separatorRun = field.getMarkSeparatorCharacterRun(r);
+                    if (separatorRun != null) {
+                        id = "_" + separatorRun.getPicOffset();
+                    }
+                    AttributesImpl attributes = new AttributesImpl();
+                    attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+                    attributes.addAttribute("", "id", "id", "CDATA", id);
+                    xhtml.startElement("div", attributes);
+                    xhtml.endElement("div");
+                }
+            }
+
+            if (runText.equals("\u0013")) {
+                // Field: the helper reports how many runs it consumed
+                j += handleSpecialCharacterRuns(p, j, tas.isHeading(), pictures, xhtml);
+            } else if (runText.startsWith("\u0008")) {
+                // Floating Picture(s)
+                for (int pn = 0; pn < runText.length(); pn++) {
+                    // Assume they're in the order from the unclaimed list...
+                    Picture picture = pictures.nextUnclaimed();
+
+                    // Output
+                    handlePictureCharacterRun(cr, picture, pictures, xhtml);
+                }
+            } else if (pictureTable.hasPicture(cr)) {
+                // Inline Picture
+                Picture picture = pictures.getFor(cr);
+                handlePictureCharacterRun(cr, picture, pictures, xhtml);
+            } else {
+                handleCharacterRun(cr, tas.isHeading(), xhtml);
+            }
+        }
+
+        // Close any still open style tags (innermost first: s, then i, then b)
+        if (curStrikeThrough) {
+            xhtml.endElement("s");
+            curStrikeThrough = false;
+        }
+        if (curItalic) {
+            xhtml.endElement("i");
+            curItalic = false;
+        }
+        if (curBold) {
+            xhtml.endElement("b");
+            curBold = false;
+        }
+
+        xhtml.endElement(tas.getTag());
+
+        return 0;
+    }
+
+    private void handleCharacterRun(CharacterRun cr, boolean skipStyling, XHTMLContentHandler xhtml)
+            throws SAXException {
+        // Skip trailing newlines
+        if (!isRendered(cr) || cr.text().equals("\r"))
+            return;
+
+        if (!skipStyling) {
+            if (cr.isBold() != curBold) {
+                // Enforce nesting -- must close s and i tags
+                if (curStrikeThrough) {
+                    xhtml.endElement("s");
+                    curStrikeThrough = false;
+                }
+                if (curItalic) {
+                    xhtml.endElement("i");
+                    curItalic = false;
+                }
+                if (cr.isBold()) {
+                    xhtml.startElement("b");
+                } else {
+                    xhtml.endElement("b");
+                }
+                curBold = cr.isBold();
+            }
+
+            if (cr.isItalic() != curItalic) {
+                // Enforce nesting -- must close s tag
+                if (curStrikeThrough) {
+                    xhtml.endElement("s");
+                    curStrikeThrough = false;
+                }
+                if (cr.isItalic()) {
+                    xhtml.startElement("i");
+                } else {
+                    xhtml.endElement("i");
+                }
+                curItalic = cr.isItalic();
+            }
+
+            if (cr.isStrikeThrough() != curStrikeThrough) {
+                if (cr.isStrikeThrough()) {
+                    xhtml.startElement("s");
+                } else {
+                    xhtml.endElement("s");
+                }
+                curStrikeThrough = cr.isStrikeThrough();
+            }
+        }
+
+        // Clean up the text
+        String text = cr.text();
+        text = text.replace('\r', '\n');
+        if (text.endsWith("\u0007")) {
+            // Strip the table cell end marker
+            text = text.substring(0, text.length() - 1);
+        }
+
+        // Copied from POI's org/apache/poi/hwpf/converter/AbstractWordConverter.processCharacters:
+
+        // Non-breaking hyphens are returned as char 30
+        text = text.replace((char) 30, UNICODECHAR_NONBREAKING_HYPHEN);
+
+        // Non-required hyphens to zero-width space
+        text = text.replace((char) 31, UNICODECHAR_ZERO_WIDTH_SPACE);
+
+        // Control characters as line break
+        text = text.replaceAll("[\u0000-\u001f]", "\n");
+        xhtml.characters(text);
+    }
+
+    /**
+     * Handles a field, i.e. the runs between a field begin mark (char 0x13)
+     * and the matching field end mark (char 0x15).
+     * Can be \13..text..\15 or \13..control..\14..text..\15 .
+     * Nesting is allowed
+     * <p>
+     * A HYPERLINK field is written as an &lt;a href="..."&gt; around its text
+     * runs; for any other field only the text runs are written.
+     *
+     * @param p           the paragraph containing the field
+     * @param index       index, within {@code p}, of the run holding the begin mark
+     * @param skipStyling passed through to {@link #handleCharacterRun}
+     * @param pictures    picture bookkeeping shared across the parse
+     * @param xhtml       handler that receives the output
+     * @return the offset from {@code index} to the run holding the end mark
+     *         (or to the end of the paragraph if there is none); adding it to
+     *         {@code index} and then advancing by one continues after the field
+     */
+    private int handleSpecialCharacterRuns(Paragraph p, int index, boolean skipStyling,
+                                           PicturesSource pictures, XHTMLContentHandler xhtml) throws SAXException, TikaException, IOException {
+        List<CharacterRun> controls = new ArrayList<CharacterRun>();
+        List<CharacterRun> texts = new ArrayList<CharacterRun>();
+        boolean has14 = false;
+
+        // Split it into before and after the 14
+        int i;
+        for (i = index + 1; i < p.numCharacterRuns(); i++) {
+            CharacterRun cr = p.getCharacterRun(i);
+            String runText = cr.text();
+            if (runText.equals("\u0013")) {
+                // Nested, oh joy...
+                // The callee expects the index of the begin mark itself, which
+                // is i. Its return value then moves i onto the nested end mark,
+                // and the loop increment steps past it, so the nested end mark
+                // is not mistaken for ours.
+                int increment = handleSpecialCharacterRuns(p, i, skipStyling, pictures, xhtml);
+                i += increment;
+            } else if (runText.equals("\u0014")) {
+                has14 = true;
+            } else if (runText.equals("\u0015")) {
+                if (!has14) {
+                    // No separator seen: everything collected so far was text
+                    texts = controls;
+                    controls = new ArrayList<CharacterRun>();
+                }
+                break;
+            } else {
+                if (has14) {
+                    texts.add(cr);
+                } else {
+                    controls.add(cr);
+                }
+            }
+        }
+
+        // Do we need to do something special with this?
+        if (controls.size() > 0) {
+            StringBuilder controlText = new StringBuilder();
+            for (CharacterRun control : controls) {
+                controlText.append(control.text());
+            }
+            String text = controlText.toString();
+
+            if ((text.startsWith("HYPERLINK") || text.startsWith(" HYPERLINK"))
+                    && text.indexOf('"') > -1) {
+                // The target is whatever follows the first double quote
+                int start = text.indexOf('"') + 1;
+                int end = findHyperlinkEnd(text, start);
+                String url = "";
+                if (start >= 0 && start < end && end <= text.length()) {
+                    url = text.substring(start, end);
+                }
+
+                xhtml.startElement("a", "href", url);
+                for (CharacterRun cr : texts) {
+                    handleCharacterRun(cr, skipStyling, xhtml);
+                }
+                xhtml.endElement("a");
+            } else {
+                // Just output the text ones
+                for (CharacterRun cr : texts) {
+                    if (pictures.hasPicture(cr)) {
+                        Picture picture = pictures.getFor(cr);
+                        handlePictureCharacterRun(cr, picture, pictures, xhtml);
+                    } else {
+                        handleCharacterRun(cr, skipStyling, xhtml);
+                    }
+                }
+            }
+        } else {
+            // We only had text
+            // Output as-is
+            for (CharacterRun cr : texts) {
+                handleCharacterRun(cr, skipStyling, xhtml);
+            }
+        }
+
+        // Tell them how many to skip over
+        return i - index;
+    }
+
+    //temporary work around for TIKA-1512
+    private int findHyperlinkEnd(String text, int start) {
+        int end = text.lastIndexOf('"');
+        if (end > start) {
+            return end;
+        }
+        end = text.lastIndexOf('\u201D');//smart right double quote
+        if (end > start) {
+            return end;
+        }
+        end = text.lastIndexOf('\r');
+        if (end > start) {
+            return end;
+        }
+        //if nothing so far, take the full length of the string
+        //If the full string is > 256 characters, it appears
+        //that the url is truncated in the .doc file.  This
+        //will return the value as it is in the file, which
+        //may be incorrect; but it is the same behavior as opening
+        //the link in MSWord.
+        //This code does not currently check that length is actually >= 256.
+        //we might want to add that?
+        return text.length();
+    }
+
+    /**
+     * Writes an &lt;img&gt; tag for the given picture and, the first time the
+     * picture is seen, passes its content on as an embedded resource.
+     *
+     * @param cr       the run referencing the picture; may be null
+     * @param picture  the picture to write; null means there is none left
+     * @param pictures picture bookkeeping shared across the parse
+     * @param xhtml    handler that receives the output
+     */
+    private void handlePictureCharacterRun(CharacterRun cr, Picture picture, PicturesSource pictures, XHTMLContentHandler xhtml)
+            throws SAXException, IOException, TikaException {
+        if (!isRendered(cr)) {
+            return;
+        }
+        if (picture == null) {
+            // Oh dear, we've run out...
+            // Probably caused by multiple \u0008 images referencing
+            //  the same real image
+            return;
+        }
+
+        String extension = picture.suggestFileExtension();
+        int pictureNumber = pictures.pictureNumber(picture);
+
+        // The file stores no name for the picture, so invent one: both the
+        //  img tag and the embedded resource need something to refer to
+        StringBuilder nameBuilder = new StringBuilder("image").append(pictureNumber);
+        if (extension.length() > 0) {
+            nameBuilder.append(".").append(extension);
+        }
+        String filename = nameBuilder.toString();
+
+        String mimeType = picture.getMimeType();
+
+        // Write the img tag
+        AttributesImpl imgAttributes = new AttributesImpl();
+        imgAttributes.addAttribute("", "src", "src", "CDATA", "embedded:" + filename);
+        imgAttributes.addAttribute("", "alt", "alt", "CDATA", filename);
+        xhtml.startElement("img", imgAttributes);
+        xhtml.endElement("img");
+
+        // Each individual image is exposed as a resource only once
+        if (pictures.hasOutput(picture)) {
+            return;
+        }
+        TikaInputStream stream = TikaInputStream.get(picture.getContent());
+        handleEmbeddedResource(stream, filename, null, mimeType, xhtml, false);
+        pictures.recordOutput(picture);
+    }
+
+    /**
+     * Writes the given text as a paragraph inside a &lt;div/&gt; of the
+     * given class. Null or empty text produces no output.
+     *
+     * @param xhtml   XHTML content handler
+     * @param section the class of the &lt;div/&gt; section emitted
+     * @param text    text to be emitted, if any
+     * @throws SAXException if an error occurs
+     */
+    private void addTextIfAny(
+            XHTMLContentHandler xhtml, String section, String text)
+            throws SAXException {
+        if (text == null || text.length() == 0) {
+            return;
+        }
+        xhtml.startElement("div", "class", section);
+        xhtml.element("p", text);
+        xhtml.endElement("div");
+    }
+
+    /**
+     * Parses an old-format Word document by delegating to
+     * {@link #parseWord6(DirectoryNode, XHTMLContentHandler)} with the
+     * root directory of the given filesystem.
+     *
+     * @param filesystem the filesystem whose root contains the document
+     * @param xhtml      handler that receives the extracted content
+     */
+    protected void parseWord6(
+            NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+        parseWord6(filesystem.getRoot(), xhtml);
+    }
+
+    /**
+     * Extracts an old-format Word document: each paragraph reported by POI's
+     * {@code Word6Extractor} is written as one &lt;p&gt; element.
+     *
+     * @param root  directory holding the document streams
+     * @param xhtml handler that receives the extracted content
+     */
+    protected void parseWord6(
+            DirectoryNode root, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+        Word6Extractor word6 = new Word6Extractor(new HWPFOldDocument(root));
+
+        for (String paragraph : word6.getParagraphText()) {
+            xhtml.element("p", paragraph);
+        }
+    }
+
+    /**
+     * Decides whether a character run takes part in the extraction.
+     * A missing (null) run counts as rendered; otherwise only runs that
+     * are not marked as deleted do.
+     *
+     * @param cr character run, may be null.
+     * @return true if character run should be included in extraction.
+     */
+    private boolean isRendered(final CharacterRun cr) {
+        if (cr == null) {
+            return true;
+        }
+        return !cr.isMarkedDeleted();
+    }
+
+    /**
+     * Immutable pairing of an XHTML tag name with an optional style class.
+     * Instances are shared (for example through the fixed style lookup), so
+     * neither value can be changed after construction.
+     */
+    public static class TagAndStyle {
+        private final String tag;
+        private final String styleClass;
+
+        /**
+         * @param tag        the XHTML tag name, e.g. "p" or "h1"
+         * @param styleClass the value for the class attribute, or null for none
+         */
+        public TagAndStyle(String tag, String styleClass) {
+            this.tag = tag;
+            this.styleClass = styleClass;
+        }
+
+        /**
+         * @return the XHTML tag name
+         */
+        public String getTag() {
+            return tag;
+        }
+
+        /**
+         * @return the style class, or null if none was given
+         */
+        public String getStyleClass() {
+            return styleClass;
+        }
+
+        /**
+         * @return true if the tag is two characters long and starts with "h"
+         */
+        public boolean isHeading() {
+            return tag.length() == 2 && tag.startsWith("h");
+        }
+    }
+
+    /**
+     * Provides access to the pictures both by offset, iteration
+     * over the un-claimed, and peeking forward
+     */
+    private static class PicturesSource {
+        private final PicturesTable picturesTable;
+        // Pictures whose content has already been passed on as a resource
+        private final Set<Picture> output = new HashSet<Picture>();
+        // Picture start offset -> picture
+        private final Map<Integer, Picture> lookup;
+        // Same order as 'all'; slots of pictures claimed by a run in the main text are null
+        private final List<Picture> nonU1based;
+        private final List<Picture> all;
+        // Read position of nextUnclaimed() within nonU1based
+        private int pn = 0;
+
+        /**
+         * Indexes all pictures of the document and works out which of them
+         * no character run of the main range refers to.
+         *
+         * @param doc the document to take the pictures from
+         */
+        private PicturesSource(HWPFDocument doc) {
+            picturesTable = doc.getPicturesTable();
+            all = picturesTable.getAllPictures();
+
+            // Build the Offset-Picture lookup map
+            lookup = new HashMap<Integer, Picture>();
+            for (Picture p : all) {
+                lookup.put(p.getStartOffset(), p);
+            }
+
+            // Work out which Pictures aren't referenced by
+            //  a \u0001 in the main text
+            // These are \u0008 escher floating ones, ones
+            //  found outside the normal text, and who
+            //  knows what else...
+            nonU1based = new ArrayList<Picture>();
+            nonU1based.addAll(all);
+            Range r = doc.getRange();
+            for (int i = 0; i < r.numCharacterRuns(); i++) {
+                CharacterRun cr = r.getCharacterRun(i);
+                if (picturesTable.hasPicture(cr)) {
+                    Picture p = getFor(cr);
+                    // No picture may be registered at this offset, or an
+                    //  earlier run may already have claimed it; in both
+                    //  cases there is no slot left to clear
+                    int at = (p == null) ? -1 : nonU1based.indexOf(p);
+                    if (at >= 0) {
+                        nonU1based.set(at, null);
+                    }
+                }
+            }
+        }
+
+        /**
+         * @return true if the picture table reports a picture for the run
+         */
+        private boolean hasPicture(CharacterRun cr) {
+            return picturesTable.hasPicture(cr);
+        }
+
+        /**
+         * Remembers that the picture's content has been output.
+         */
+        private void recordOutput(Picture picture) {
+            output.add(picture);
+        }
+
+        /**
+         * @return true if {@link #recordOutput(Picture)} was called for the picture
+         */
+        private boolean hasOutput(Picture picture) {
+            return output.contains(picture);
+        }
+
+        /**
+         * @return the 1-based position of the picture among all pictures,
+         *         or 0 if it is not one of them
+         */
+        private int pictureNumber(Picture picture) {
+            return all.indexOf(picture) + 1;
+        }
+
+        /**
+         * @return the picture starting at the run's picture offset, or null if there is none
+         */
+        private Picture getFor(CharacterRun cr) {
+            return lookup.get(cr.getPicOffset());
+        }
+
+        /**
+         * Return the next unclaimed one, used towards
+         * the end
+         *
+         * @return the next picture not claimed by the main text, or null once exhausted
+         */
+        private Picture nextUnclaimed() {
+            while (pn < nonU1based.size()) {
+                Picture p = nonU1based.get(pn);
+                pn++;
+                if (p != null) {
+                    return p;
+                }
+            }
+            return null;
+        }
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,312 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URI;
+import java.util.List;
+
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
+import org.apache.poi.openxml4j.opc.TargetMode;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.Ole10Native;
+import org.apache.poi.poifs.filesystem.Ole10NativeException;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.xmlbeans.XmlException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Base class for all Tika OOXML extractors.
+ * <p>
+ * Tika extractors decorate POI extractors so that the parsed content of
+ * documents is returned as a sequence of XHTML SAX events. Subclasses must
+ * implement the buildXHTML method {@link #buildXHTML(XHTMLContentHandler)} that
+ * populates the {@link XHTMLContentHandler} object received as parameter.
+ */
+public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
+    static final String RELATION_AUDIO = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/audio";
+    static final String RELATION_IMAGE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image";
+    static final String RELATION_OLE_OBJECT = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/oleObject";
+    static final String RELATION_PACKAGE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/package";
+
+    private static final String TYPE_OLE_OBJECT =
+            "application/vnd.openxmlformats-officedocument.oleObject";
+    private final EmbeddedDocumentExtractor embeddedExtractor;
+    protected POIXMLTextExtractor extractor;
+
+    /**
+     * Creates the extractor.
+     *
+     * @param context   parse context; if it carries an
+     *                  {@link EmbeddedDocumentExtractor} that one is used for
+     *                  embedded resources, otherwise a
+     *                  {@link ParsingEmbeddedDocumentExtractor} is created
+     * @param extractor the POI text extractor being decorated
+     */
+    public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor extractor) {
+        this.extractor = extractor;
+
+        EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
+        this.embeddedExtractor =
+                (ex != null) ? ex : new ParsingEmbeddedDocumentExtractor(context);
+    }
+
+    /**
+     * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getDocument()
+     */
+    @Override
+    public POIXMLDocument getDocument() {
+        return extractor.getDocument();
+    }
+
+    /**
+     * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getMetadataExtractor()
+     */
+    @Override
+    public MetadataExtractor getMetadataExtractor() {
+        return new MetadataExtractor(extractor);
+    }
+
+    /**
+     * Emits the document as XHTML SAX events: the main content produced by
+     * {@link #buildXHTML(XHTMLContentHandler)}, followed by the embedded parts
+     * and then the thumbnail(s).
+     * <p>
+     * The {@code context} argument is not read by this implementation.
+     *
+     * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getXHTML(org.xml.sax.ContentHandler,
+     * org.apache.tika.metadata.Metadata, org.apache.tika.parser.ParseContext)
+     */
+    @Override
+    public void getXHTML(
+            ContentHandler handler, Metadata metadata, ParseContext context)
+            throws SAXException, XmlException, IOException, TikaException {
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        buildXHTML(xhtml);
+
+        // Now do any embedded parts
+        handleEmbeddedParts(handler);
+
+        // thumbnail
+        handleThumbnail(handler);
+
+        xhtml.endDocument();
+    }
+
+    /**
+     * Strips everything up to and including the last '/' and everything from
+     * the last '.' onwards, e.g. {@code /ppt/slides/slide1.xml} gives
+     * {@code slide1}.
+     *
+     * @param desc a path-like description, must not be null
+     * @return the bare name without directory and extension
+     */
+    protected String getJustFileName(String desc) {
+        int idx = desc.lastIndexOf('/');
+        if (idx != -1) {
+            desc = desc.substring(idx + 1);
+        }
+        idx = desc.lastIndexOf('.');
+        if (idx != -1) {
+            desc = desc.substring(0, idx);
+        }
+
+        return desc;
+    }
+
+    /**
+     * Passes every thumbnail of the package to the embedded document
+     * extractor, preceded by an empty {@code div} placeholder carrying the
+     * thumbnail part name as its id.
+     * <p>
+     * Thumbnail handling is best effort: any failure is swallowed so that it
+     * cannot break extraction of the document itself.
+     */
+    private void handleThumbnail(ContentHandler handler) {
+        try {
+            OPCPackage opcPackage = extractor.getPackage();
+            for (PackageRelationship rel : opcPackage.getRelationshipsByType(PackageRelationshipTypes.THUMBNAIL)) {
+                PackagePart tPart = opcPackage.getPart(rel);
+                if (tPart == null) {
+                    // Relationship points at a part that is not in the
+                    // package; skip it instead of aborting the whole loop
+                    continue;
+                }
+
+                TikaInputStream tStream = TikaInputStream.get(tPart.getInputStream());
+                try {
+                    Metadata thumbnailMetadata = new Metadata();
+                    String thumbName = tPart.getPartName().getName();
+                    thumbnailMetadata.set(Metadata.RESOURCE_NAME_KEY, thumbName);
+
+                    AttributesImpl attributes = new AttributesImpl();
+                    attributes.addAttribute(XHTML, "class", "class", "CDATA", "embedded");
+                    attributes.addAttribute(XHTML, "id", "id", "CDATA", thumbName);
+                    handler.startElement(XHTML, "div", "div", attributes);
+                    handler.endElement(XHTML, "div", "div");
+
+                    thumbnailMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, thumbName);
+                    thumbnailMetadata.set(Metadata.CONTENT_TYPE, tPart.getContentType());
+                    thumbnailMetadata.set(TikaCoreProperties.TITLE, tPart.getPartName().getName());
+
+                    if (embeddedExtractor.shouldParseEmbedded(thumbnailMetadata)) {
+                        embeddedExtractor.parseEmbedded(tStream, new EmbeddedContentHandler(handler), thumbnailMetadata, false);
+                    }
+                } finally {
+                    // Release the stream even when parsing fails part-way
+                    tStream.close();
+                }
+            }
+        } catch (Exception ignored) {
+            // Deliberately ignored: a broken or missing thumbnail must not
+            // fail the parse of the enclosing document
+        }
+    }
+
+    /**
+     * Walks the relationships of every main document part and hands each
+     * internal audio, image, package or OLE object target to the matching
+     * handler.
+     *
+     * @throws TikaException if the package structure is invalid
+     */
+    private void handleEmbeddedParts(ContentHandler handler)
+            throws TikaException, IOException, SAXException {
+        try {
+            for (PackagePart source : getMainDocumentParts()) {
+                for (PackageRelationship rel : source.getRelationships()) {
+
+                    // Relationship ids are only prefixed for sources whose
+                    // file name starts with "slide"
+                    URI sourceURI = rel.getSourceURI();
+                    String sourceDesc = "";
+                    if (sourceURI != null) {
+                        String sourceName = getJustFileName(sourceURI.getPath());
+                        if (sourceName.startsWith("slide")) {
+                            sourceDesc = sourceName + "_";
+                        }
+                    }
+
+                    if (rel.getTargetMode() != TargetMode.INTERNAL) {
+                        continue;
+                    }
+
+                    PackagePart target;
+                    try {
+                        target = source.getRelatedPart(rel);
+                    } catch (IllegalArgumentException ex) {
+                        // Target could not be resolved, skip the relationship
+                        continue;
+                    }
+
+                    String type = rel.getRelationshipType();
+                    if (RELATION_OLE_OBJECT.equals(type)
+                            && TYPE_OLE_OBJECT.equals(target.getContentType())) {
+                        handleEmbeddedOLE(target, handler, sourceDesc + rel.getId());
+                    } else if (RELATION_AUDIO.equals(type)
+                            || RELATION_IMAGE.equals(type)
+                            || RELATION_PACKAGE.equals(type)
+                            || RELATION_OLE_OBJECT.equals(type)) {
+                        handleEmbeddedFile(target, handler, sourceDesc + rel.getId());
+                    }
+                }
+            }
+        } catch (InvalidFormatException e) {
+            throw new TikaException("Broken OOXML file", e);
+        }
+    }
+
+    /**
+     * Handles an embedded OLE object in the document
+     *
+     * @param part    the package part holding the OLE2 container
+     * @param handler receives the SAX events of the embedded content
+     * @param rel     value stored as the embedded relationship id
+     */
+    private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, String rel)
+            throws IOException, SAXException {
+        // A POIFSFileSystem needs to be at least 3 blocks big to be valid
+        if (part.getSize() >= 0 && part.getSize() < 512 * 3) {
+            // Too small, skip
+            return;
+        }
+
+        // Open the POIFS (OLE2) structure and process
+        POIFSFileSystem fs = new POIFSFileSystem(part.getInputStream());
+        TikaInputStream stream = null;
+        try {
+            Metadata metadata = new Metadata();
+            metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);
+
+            DirectoryNode root = fs.getRoot();
+            POIFSDocumentType type = POIFSDocumentType.detectType(root);
+
+            if (root.hasEntry("CONTENTS")
+                    && root.hasEntry("\u0001Ole")
+                    && root.hasEntry("\u0001CompObj")
+                    && root.hasEntry("\u0003ObjInfo")) {
+                // TIKA-704: OLE 2.0 embedded non-Office document?
+                stream = TikaInputStream.get(
+                        fs.createDocumentInputStream("CONTENTS"));
+                if (embeddedExtractor.shouldParseEmbedded(metadata)) {
+                    embeddedExtractor.parseEmbedded(
+                            stream, new EmbeddedContentHandler(handler),
+                            metadata, false);
+                }
+            } else if (POIFSDocumentType.OLE10_NATIVE == type) {
+                // TIKA-704: OLE 1.0 embedded document
+                Ole10Native ole =
+                        Ole10Native.createFromEmbeddedOleObject(fs);
+                if (ole.getLabel() != null) {
+                    metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel());
+                }
+                byte[] data = ole.getDataBuffer();
+                if (data != null) {
+                    stream = TikaInputStream.get(data);
+                }
+
+                if (stream != null
+                        && embeddedExtractor.shouldParseEmbedded(metadata)) {
+                    embeddedExtractor.parseEmbedded(
+                            stream, new EmbeddedContentHandler(handler),
+                            metadata, false);
+                }
+            } else {
+                handleEmbeddedFile(part, handler, rel);
+            }
+        } catch (FileNotFoundException e) {
+            // There was no CONTENTS entry, so skip this part
+        } catch (Ole10NativeException e) {
+            // Could not process an OLE 1.0 entry, so skip this part
+        } finally {
+            // Release the wrapped stream on every exit path
+            if (stream != null) {
+                stream.close();
+            }
+        }
+    }
+
+    /**
+     * Handles an embedded file in the document
+     *
+     * @param part    the package part holding the embedded file
+     * @param handler receives the SAX events of the embedded content
+     * @param rel     value stored as the embedded relationship id
+     */
+    protected void handleEmbeddedFile(PackagePart part, ContentHandler handler, String rel)
+            throws SAXException, IOException {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);
+
+        // Get the name
+        String name = part.getPartName().getName();
+        metadata.set(
+                Metadata.RESOURCE_NAME_KEY,
+                name.substring(name.lastIndexOf('/') + 1));
+
+        // Get the content type
+        metadata.set(
+                Metadata.CONTENT_TYPE, part.getContentType());
+
+        // Call the recursing handler
+        if (embeddedExtractor.shouldParseEmbedded(metadata)) {
+            TikaInputStream stream = TikaInputStream.get(part.getInputStream());
+            try {
+                embeddedExtractor.parseEmbedded(
+                        stream,
+                        new EmbeddedContentHandler(handler),
+                        metadata, false);
+            } finally {
+                // Release the wrapped stream even if the embedded parse fails
+                stream.close();
+            }
+        }
+    }
+
+    /**
+     * Populates the {@link XHTMLContentHandler} object received as parameter.
+     */
+    protected abstract void buildXHTML(XHTMLContentHandler xhtml)
+            throws SAXException, XmlException, IOException;
+
+    /**
+     * Return a list of the main parts of the document, used
+     * when searching for embedded resources.
+     * This should be all the parts of the document that end
+     * up with things embedded into them.
+     */
+    protected abstract List<PackagePart> getMainDocumentParts()
+            throws TikaException;
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,295 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.math.BigDecimal;
+import java.util.Date;
+
+import org.apache.poi.POIXMLProperties.CoreProperties;
+import org.apache.poi.POIXMLProperties.CustomProperties;
+import org.apache.poi.POIXMLProperties.ExtendedProperties;
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.openxml4j.opc.internal.PackagePropertiesPart;
+import org.apache.poi.openxml4j.util.Nullable;
+import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.MSOffice;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.PagedText;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.microsoft.SummaryExtractor;
+import org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperty;
+import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
+
+/**
+ * OOXML metadata extractor.
+ * <p>
+ * Currently POI doesn't support metadata extraction for OOXML.
+ *
+ * @see OOXMLExtractor#getMetadataExtractor()
+ */
+public class MetadataExtractor {
+
+    private final POIXMLTextExtractor extractor;
+
+    /**
+     * @param extractor the POI extractor whose core, extended and custom
+     *                  properties are read by {@link #extract(Metadata)}
+     */
+    public MetadataExtractor(POIXMLTextExtractor extractor) {
+        this.extractor = extractor;
+    }
+
+    /**
+     * Copies the core, extended and custom properties into the given
+     * metadata object. Nothing is copied when the extractor has no document,
+     * unless it is an {@link XSSFEventBasedExcelExtractor} with a package.
+     */
+    public void extract(Metadata metadata) throws TikaException {
+        if (extractor.getDocument() == null
+                && !(extractor instanceof XSSFEventBasedExcelExtractor
+                && extractor.getPackage() != null)) {
+            // Neither a document nor an event-based Excel package to read
+            return;
+        }
+
+        extractMetadata(extractor.getCoreProperties(), metadata);
+        extractMetadata(extractor.getExtendedProperties(), metadata);
+        extractMetadata(extractor.getCustomProperties(), metadata);
+    }
+
+    /**
+     * Maps the core package properties onto their Tika keys.
+     */
+    private void extractMetadata(CoreProperties properties, Metadata metadata) {
+        PackagePropertiesPart core = properties.getUnderlyingProperties();
+
+        addProperty(metadata, OfficeOpenXMLCore.CATEGORY, core.getCategoryProperty());
+        addProperty(metadata, OfficeOpenXMLCore.CONTENT_STATUS, core.getContentStatusProperty());
+        addProperty(metadata, TikaCoreProperties.CREATED, core.getCreatedProperty());
+        addMultiProperty(metadata, TikaCoreProperties.CREATOR, core.getCreatorProperty());
+        addProperty(metadata, TikaCoreProperties.DESCRIPTION, core.getDescriptionProperty());
+        addProperty(metadata, TikaCoreProperties.IDENTIFIER, core.getIdentifierProperty());
+        addProperty(metadata, TikaCoreProperties.KEYWORDS, core.getKeywordsProperty());
+        addProperty(metadata, TikaCoreProperties.LANGUAGE, core.getLanguageProperty());
+        addProperty(metadata, TikaCoreProperties.MODIFIER, core.getLastModifiedByProperty());
+        addProperty(metadata, TikaCoreProperties.PRINT_DATE, core.getLastPrintedProperty());
+        addProperty(metadata, Metadata.LAST_MODIFIED, core.getModifiedProperty());
+        addProperty(metadata, TikaCoreProperties.MODIFIED, core.getModifiedProperty());
+        addProperty(metadata, OfficeOpenXMLCore.REVISION, core.getRevisionProperty());
+        // TODO: Move to OO subject in Tika 2.0
+        addProperty(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT,
+                core.getSubjectProperty());
+        addProperty(metadata, TikaCoreProperties.TITLE, core.getTitleProperty());
+        addProperty(metadata, OfficeOpenXMLCore.VERSION, core.getVersionProperty());
+
+        // Legacy Tika-1.0 style stats
+        // TODO Remove these in Tika 2.0
+        addProperty(metadata, Metadata.CATEGORY, core.getCategoryProperty());
+        addProperty(metadata, Metadata.CONTENT_STATUS, core.getContentStatusProperty());
+        addProperty(metadata, Metadata.REVISION_NUMBER, core.getRevisionProperty());
+        addProperty(metadata, Metadata.VERSION, core.getVersionProperty());
+    }
+
+    /**
+     * Maps the extended (application) properties and the document
+     * statistics onto their Tika keys.
+     */
+    private void extractMetadata(ExtendedProperties properties,
+                                 Metadata metadata) {
+        CTProperties ext = properties.getUnderlyingProperties();
+
+        addProperty(metadata, OfficeOpenXMLExtended.APPLICATION, ext.getApplication());
+        addProperty(metadata, OfficeOpenXMLExtended.APP_VERSION, ext.getAppVersion());
+        addProperty(metadata, TikaCoreProperties.PUBLISHER, ext.getCompany());
+        addProperty(metadata, OfficeOpenXMLExtended.COMPANY, ext.getCompany());
+        SummaryExtractor.addMulti(metadata, OfficeOpenXMLExtended.MANAGER, ext.getManager());
+        addProperty(metadata, OfficeOpenXMLExtended.NOTES, ext.getNotes());
+        addProperty(metadata, OfficeOpenXMLExtended.PRESENTATION_FORMAT, ext.getPresentationFormat());
+        addProperty(metadata, OfficeOpenXMLExtended.TEMPLATE, ext.getTemplate());
+        addProperty(metadata, OfficeOpenXMLExtended.TOTAL_TIME, ext.getTotalTime());
+
+        // The page count wins; the slide count is only used when there
+        // is no positive page count
+        if (ext.getPages() > 0) {
+            metadata.set(PagedText.N_PAGES, ext.getPages());
+        } else if (ext.getSlides() > 0) {
+            metadata.set(PagedText.N_PAGES, ext.getSlides());
+        }
+
+        // Process the document statistics
+        addProperty(metadata, Office.PAGE_COUNT, ext.getPages());
+        addProperty(metadata, Office.SLIDE_COUNT, ext.getSlides());
+        addProperty(metadata, Office.PARAGRAPH_COUNT, ext.getParagraphs());
+        addProperty(metadata, Office.LINE_COUNT, ext.getLines());
+        addProperty(metadata, Office.WORD_COUNT, ext.getWords());
+        addProperty(metadata, Office.CHARACTER_COUNT, ext.getCharacters());
+        addProperty(metadata, Office.CHARACTER_COUNT_WITH_SPACES, ext.getCharactersWithSpaces());
+
+        // Legacy Tika-1.0 style stats
+        // TODO Remove these in Tika 2.0
+        addProperty(metadata, Metadata.APPLICATION_NAME, ext.getApplication());
+        addProperty(metadata, Metadata.APPLICATION_VERSION, ext.getAppVersion());
+        addProperty(metadata, Metadata.MANAGER, ext.getManager());
+        addProperty(metadata, Metadata.NOTES, ext.getNotes());
+        addProperty(metadata, Metadata.PRESENTATION_FORMAT, ext.getPresentationFormat());
+        addProperty(metadata, Metadata.TEMPLATE, ext.getTemplate());
+        addProperty(metadata, Metadata.TOTAL_TIME, ext.getTotalTime());
+        addProperty(metadata, MSOffice.PAGE_COUNT, ext.getPages());
+        addProperty(metadata, MSOffice.SLIDE_COUNT, ext.getSlides());
+        addProperty(metadata, MSOffice.PARAGRAPH_COUNT, ext.getParagraphs());
+        addProperty(metadata, MSOffice.LINE_COUNT, ext.getLines());
+        addProperty(metadata, MSOffice.WORD_COUNT, ext.getWords());
+        addProperty(metadata, MSOffice.CHARACTER_COUNT, ext.getCharacters());
+        addProperty(metadata, MSOffice.CHARACTER_COUNT_WITH_SPACES, ext.getCharactersWithSpaces());
+    }
+
+    /**
+     * Stores every custom property under the key {@code custom:<name>}.
+     * Dates are stored as date properties, all other supported types as
+     * their string form; unsupported types are skipped.
+     */
+    private void extractMetadata(CustomProperties properties,
+                                 Metadata metadata) {
+        org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperties
+                customProps = properties.getUnderlyingProperties();
+        for (int idx = 0; idx < customProps.sizeOfPropertyArray(); idx++) {
+            CTProperty custom = customProps.getPropertyArray(idx);
+            String text = null;
+            Date when = null;
+
+            // Strings, dates and booleans
+            if (custom.isSetLpwstr()) {
+                text = custom.getLpwstr();
+            } else if (custom.isSetLpstr()) {
+                text = custom.getLpstr();
+            } else if (custom.isSetDate()) {
+                when = custom.getDate().getTime();
+            } else if (custom.isSetFiletime()) {
+                when = custom.getFiletime().getTime();
+            } else if (custom.isSetBool()) {
+                text = Boolean.toString(custom.getBool());
+            }
+
+            // Integers
+            else if (custom.isSetI1()) {
+                text = Integer.toString(custom.getI1());
+            } else if (custom.isSetI2()) {
+                text = Integer.toString(custom.getI2());
+            } else if (custom.isSetI4()) {
+                text = Integer.toString(custom.getI4());
+            } else if (custom.isSetI8()) {
+                text = Long.toString(custom.getI8());
+            } else if (custom.isSetInt()) {
+                text = Integer.toString(custom.getInt());
+            }
+
+            // Unsigned Integers
+            else if (custom.isSetUi1()) {
+                text = Integer.toString(custom.getUi1());
+            } else if (custom.isSetUi2()) {
+                text = Integer.toString(custom.getUi2());
+            } else if (custom.isSetUi4()) {
+                text = Long.toString(custom.getUi4());
+            } else if (custom.isSetUi8()) {
+                text = custom.getUi8().toString();
+            } else if (custom.isSetUint()) {
+                text = Long.toString(custom.getUint());
+            }
+
+            // Reals
+            else if (custom.isSetR4()) {
+                text = Float.toString(custom.getR4());
+            } else if (custom.isSetR8()) {
+                text = Double.toString(custom.getR8());
+            } else if (custom.isSetDecimal()) {
+                BigDecimal decimal = custom.getDecimal();
+                text = (decimal == null) ? null : decimal.toPlainString();
+            } else if (custom.isSetArray()) {
+                // TODO Fetch the array values and output
+            } else if (custom.isSetVector()) {
+                // TODO Fetch the vector values and output
+            } else if (custom.isSetBlob() || custom.isSetOblob()) {
+                // TODO Decode, if possible
+            } else if (custom.isSetStream() || custom.isSetOstream() ||
+                    custom.isSetVstream()) {
+                // TODO Decode, if possible
+            } else if (custom.isSetStorage() || custom.isSetOstorage()) {
+                // TODO Decode, if possible
+            } else {
+                // This type isn't currently supported yet, skip the property
+            }
+
+            // A date takes precedence over a textual value
+            String key = "custom:" + custom.getName();
+            if (when != null) {
+                metadata.set(Property.externalDate(key), when);
+            } else if (text != null) {
+                metadata.set(key, text);
+            }
+        }
+    }
+
+    /**
+     * Sets the property from the wrapped value when that value is a
+     * {@link Date}, {@link String}, {@link Integer} or {@link Double};
+     * null values and values of any other type are ignored.
+     */
+    private <T> void addProperty(Metadata metadata, Property property, Nullable<T> nullableValue) {
+        T value = nullableValue.getValue();
+        if (value == null) {
+            return;
+        }
+
+        if (value instanceof Date) {
+            metadata.set(property, (Date) value);
+        } else if (value instanceof String) {
+            metadata.set(property, (String) value);
+        } else if (value instanceof Integer) {
+            metadata.set(property, (Integer) value);
+        } else if (value instanceof Double) {
+            metadata.set(property, (Double) value);
+        }
+    }
+
+    /**
+     * Sets the named entry to the string form of the wrapped value,
+     * unless the wrapped value is null.
+     */
+    private void addProperty(Metadata metadata, String name, Nullable<?> value) {
+        Object held = value.getValue();
+        if (held != null) {
+            addProperty(metadata, name, held.toString());
+        }
+    }
+
+    /**
+     * Sets the property unless the value is null.
+     */
+    private void addProperty(Metadata metadata, Property property, String value) {
+        if (value == null) {
+            return;
+        }
+        metadata.set(property, value);
+    }
+
+    /**
+     * Sets the named entry unless the value is null.
+     */
+    private void addProperty(Metadata metadata, String name, String value) {
+        if (value == null) {
+            return;
+        }
+        metadata.set(name, value);
+    }
+
+    /**
+     * Sets the property only for strictly positive values.
+     */
+    private void addProperty(Metadata metadata, Property property, int value) {
+        if (value <= 0) {
+            return;
+        }
+        metadata.set(property, value);
+    }
+
+    /**
+     * Sets the named entry, as a decimal string, only for strictly
+     * positive values.
+     */
+    private void addProperty(Metadata metadata, String name, int value) {
+        if (value <= 0) {
+            return;
+        }
+        metadata.set(name, Integer.toString(value));
+    }
+
+    /**
+     * Hands the wrapped string to {@link SummaryExtractor#addMulti}; does
+     * nothing when the wrapper itself is null.
+     */
+    private void addMultiProperty(Metadata metadata, Property property, Nullable<String> value) {
+        if (value != null) {
+            SummaryExtractor.addMulti(metadata, property, value.getValue());
+        }
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.xmlbeans.XmlException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Interface implemented by all Tika OOXML extractors.
+ *
+ * @see org.apache.poi.POIXMLTextExtractor
+ */
+public interface OOXMLExtractor {
+
+    /**
+     * Returns the opened document.
+     *
+     * @return the POI document this extractor works on
+     * @see POIXMLTextExtractor#getDocument()
+     */
+    POIXMLDocument getDocument();
+
+    /**
+     * Returns the Tika-side metadata extractor for the document.
+     * <p>
+     * This exists because {@link POIXMLTextExtractor#getMetadataTextExtractor()}
+     * is not yet supported for OOXML by POI.
+     *
+     * @return the metadata extractor
+     */
+    MetadataExtractor getMetadataExtractor();
+
+    /**
+     * Parses the document into a sequence of XHTML SAX events sent to the
+     * given content handler.
+     *
+     * @param handler  receives the XHTML SAX events
+     * @param metadata metadata of the document being parsed
+     * @param context  the parse context
+     * @throws SAXException  declared for failures reported by the handler
+     * @throws XmlException  declared for XML (XMLBeans) processing failures
+     * @throws IOException   declared for failures reading the document
+     * @throws TikaException declared for other parse failures
+     */
+    void getXHTML(ContentHandler handler, Metadata metadata, ParseContext context)
+            throws SAXException, XmlException, IOException, TikaException;
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Locale;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.extractor.ExtractorFactory;
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.openxml4j.opc.PackageAccess;
+import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xslf.usermodel.XMLSlideShow;
+import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
+import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.EmptyParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pkg.ZipContainerDetector;
+import org.apache.xmlbeans.XmlException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Figures out the correct {@link OOXMLExtractor} for the supplied document and
+ * returns it.
+ */
+public class OOXMLExtractorFactory {
+
+    public static void parse(
+            InputStream stream, ContentHandler baseHandler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        Locale locale = context.get(Locale.class, Locale.getDefault());
+        ExtractorFactory.setThreadPrefersEventExtractors(true);
+
+        try {
+            OOXMLExtractor extractor;
+            OPCPackage pkg;
+
+            // Locate or Open the OPCPackage for the file
+            TikaInputStream tis = TikaInputStream.cast(stream);
+            if (tis != null && tis.getOpenContainer() instanceof OPCPackage) {
+                pkg = (OPCPackage) tis.getOpenContainer();
+            } else if (tis != null && tis.hasFile()) {
+                pkg = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ);
+                tis.setOpenContainer(pkg);
+            } else {
+                InputStream shield = new CloseShieldInputStream(stream);
+                pkg = OPCPackage.open(shield);
+            }
+
+            // Get the type, and ensure it's one we handle
+            MediaType type = ZipContainerDetector.detectOfficeOpenXML(pkg);
+            if (type == null || OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) {
+                // Not a supported type, delegate to Empty Parser
+                EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context);
+                return;
+            }
+            metadata.set(Metadata.CONTENT_TYPE, type.toString());
+
+            // Have the appropriate OOXML text extractor picked
+            POIXMLTextExtractor poiExtractor = ExtractorFactory.createExtractor(pkg);
+
+            POIXMLDocument document = poiExtractor.getDocument();
+            if (poiExtractor instanceof XSSFEventBasedExcelExtractor) {
+                extractor = new XSSFExcelExtractorDecorator(
+                        context, (XSSFEventBasedExcelExtractor) poiExtractor, locale);
+            } else if (document == null) {
+                throw new TikaException(
+                        "Expecting UserModel based POI OOXML extractor with a document, but none found. " +
+                                "The extractor returned was a " + poiExtractor
+                );
+            } else if (document instanceof XMLSlideShow) {
+                extractor = new XSLFPowerPointExtractorDecorator(
+                        context, (XSLFPowerPointExtractor) poiExtractor);
+            } else if (document instanceof XWPFDocument) {
+                extractor = new XWPFWordExtractorDecorator(
+                        context, (XWPFWordExtractor) poiExtractor);
+            } else {
+                extractor = new POIXMLTextExtractorDecorator(context, poiExtractor);
+            }
+
+            // Get the bulk of the metadata first, so that it's accessible during
+            //  parsing if desired by the client (see TIKA-1109)
+            extractor.getMetadataExtractor().extract(metadata);
+
+            // Extract the text, along with any in-document metadata
+            extractor.getXHTML(baseHandler, metadata, context);
+        } catch (IllegalArgumentException e) {
+            if (e.getMessage() != null &&
+                    e.getMessage().startsWith("No supported documents found")) {
+                throw new TikaException(
+                        "TIKA-418: RuntimeException while getting content"
+                                + " for thmx and xps file types", e);
+            } else {
+                throw new TikaException("Error creating OOXML extractor", e);
+            }
+        } catch (InvalidFormatException e) {
+            throw new TikaException("Error creating OOXML extractor", e);
+        } catch (OpenXML4JException e) {
+            throw new TikaException("Error creating OOXML extractor", e);
+        } catch (XmlException e) {
+            throw new TikaException("Error creating OOXML extractor", e);
+
+        }
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.poi.openxml4j.util.ZipSecureFile;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Office Open XML (OOXML) parser.
+ * <p>
+ * Advertises the media types in {@link #SUPPORTED_TYPES} and hands the actual
+ * processing to {@link OOXMLExtractorFactory}.
+ */
+public class OOXMLParser extends AbstractParser {
+    static {
+        //turn off POI's zip bomb detection because we have our own
+        ZipSecureFile.setMinInflateRatio(-1.0d);
+    }
+
+    /**
+     * Media types this parser reports through
+     * {@link #getSupportedTypes(ParseContext)}.
+     */
+    protected static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    MediaType.application("x-tika-ooxml"),
+                    MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation"),
+                    MediaType.application("vnd.ms-powerpoint.presentation.macroenabled.12"),
+                    MediaType.application("vnd.openxmlformats-officedocument.presentationml.template"),
+                    MediaType.application("vnd.openxmlformats-officedocument.presentationml.slideshow"),
+                    MediaType.application("vnd.ms-powerpoint.slideshow.macroenabled.12"),
+                    MediaType.application("vnd.ms-powerpoint.addin.macroenabled.12"),
+                    MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
+                    MediaType.application("vnd.ms-excel.sheet.macroenabled.12"),
+                    MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.template"),
+                    MediaType.application("vnd.ms-excel.template.macroenabled.12"),
+                    MediaType.application("vnd.ms-excel.addin.macroenabled.12"),
+                    MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document"),
+                    MediaType.application("vnd.ms-word.document.macroenabled.12"),
+                    MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.template"),
+                    MediaType.application("vnd.ms-word.template.macroenabled.12"))));
+    /**
+     * We claim to support all OOXML files, but we actually don't support a small
+     * number of them.
+     * This list is used to decline certain formats that are not yet supported
+     * by Tika and/or POI.
+     */
+    protected static final Set<MediaType> UNSUPPORTED_OOXML_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    MediaType.application("vnd.ms-excel.sheet.binary.macroenabled.12"),
+                    MediaType.application("vnd.ms-xpsdocument")
+            )));
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = 6535995710857776481L;
+
+    /**
+     * Returns the OOXML media types handled by this parser.
+     *
+     * @param context the parse context (not consulted)
+     * @return the unmodifiable {@link #SUPPORTED_TYPES} set
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses an OOXML document by delegating to
+     * {@link OOXMLExtractorFactory#parse(InputStream, ContentHandler, Metadata, ParseContext)}.
+     *
+     * @param stream   the document to parse
+     * @param handler  receiver of the XHTML SAX events
+     * @param metadata metadata object associated with the document
+     * @param context  parse context supplied by the caller
+     * @throws IOException   if the document cannot be read
+     * @throws SAXException  if a SAX error occurs while emitting events
+     * @throws TikaException if the document cannot be processed
+     */
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // Have the OOXML file processed
+        OOXMLExtractorFactory.parse(stream, handler, metadata, context);
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+public class POIXMLTextExtractorDecorator extends AbstractOOXMLExtractor {
+
+    public POIXMLTextExtractorDecorator(ParseContext context, POIXMLTextExtractor extractor) {
+        super(context, extractor);
+    }
+
+    @Override
+    protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException {
+        // extract document content as a single string (not structured)
+        xhtml.element("p", extractor.getText());
+    }
+
+    @Override
+    protected List<PackagePart> getMainDocumentParts() {
+        return new ArrayList<PackagePart>();
+    }
+}