You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/16 19:23:06 UTC
svn commit: r1725014 [9/28] - in /tika/branches/2.x:
tika-parser-bundles/tika-multimedia-bundle/ tika-parser-modules/
tika-parser-modules/tika-advanced-module/
tika-parser-modules/tika-advanced-parser-module/
tika-parser-modules/tika-advanced-parser-mo...
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft;
+
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Locale;
+import java.util.Set;
+
+import com.healthmarketscience.jackcess.CryptCodecProvider;
+import com.healthmarketscience.jackcess.Database;
+import com.healthmarketscience.jackcess.DatabaseBuilder;
+import com.healthmarketscience.jackcess.util.LinkResolver;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser that handles Microsoft Access files via
+ * <a href="http://jackcess.sourceforge.net/>Jackcess</a>
+ * <p>
+ * Many, many thanks to LexisNexis®/Health Market Science (HMS), Brian O'Neill,
+ * and James Ahlborn for relicensing Jackcess to Apache v2.0!
+ */
+public class JackcessParser extends AbstractParser {
+
+ public static final String SUMMARY_PROPERTY_PREFIX = "MDB_SUMMARY_PROP" + Metadata.NAMESPACE_PREFIX_DELIMITER;
+ public static String MDB_PROPERTY_PREFIX = "MDB_PROP" + Metadata.NAMESPACE_PREFIX_DELIMITER;
+ public static String USER_DEFINED_PROPERTY_PREFIX = "MDB_USER_PROP" + Metadata.NAMESPACE_PREFIX_DELIMITER;
+ public static Property MDB_PW = Property.externalText("Password");
+ private final static LinkResolver IGNORE_LINK_RESOLVER = new IgnoreLinkResolver();
+
+ //TODO: figure out how to get this info
+ // public static Property LINKED_DATABASES = Property.externalTextBag("LinkedDatabases");
+
+ private static final long serialVersionUID = -752276948656079347L;
+
+ private static final MediaType MEDIA_TYPE = MediaType.application("x-msaccess");
+
+ private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MEDIA_TYPE);
+
+ private Locale locale = Locale.ROOT;
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws IOException, SAXException, TikaException {
+ TikaInputStream tis = TikaInputStream.get(stream);
+ Database db = null;
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+
+ String password = null;
+ PasswordProvider passwordProvider = context.get(PasswordProvider.class);
+ if (passwordProvider != null) {
+ password = passwordProvider.getPassword(metadata);
+ }
+ try {
+ if (password == null) {
+ //do this to ensure encryption/wrong password exception vs. more generic
+ //"need right codec" error message.
+ db = new DatabaseBuilder(tis.getFile())
+ .setCodecProvider(new CryptCodecProvider())
+ .setReadOnly(true).open();
+ } else {
+ db = new DatabaseBuilder(tis.getFile())
+ .setCodecProvider(new CryptCodecProvider(password))
+ .setReadOnly(true).open();
+ }
+ db.setLinkResolver(IGNORE_LINK_RESOLVER);//just in case
+ JackcessExtractor ex = new JackcessExtractor(context, locale);
+ ex.parse(db, xhtml, metadata);
+ } catch (IllegalStateException e) {
+ if (e.getMessage() != null && e.getMessage().contains("Incorrect password")) {
+ throw new EncryptedDocumentException(e);
+ }
+ throw e;
+ } finally {
+ if (db != null) {
+ try {
+ db.close();
+ } catch (IOException e) {
+ //swallow = silent close
+ }
+ }
+ }
+ xhtml.endDocument();
+ }
+
+ private static final class IgnoreLinkResolver implements LinkResolver {
+ //If links are resolved, Jackcess might try to open and process
+ //any file on the current system that is specified as a linked db.
+ //This could be a nasty security issue.
+ @Override
+ public Database resolveLinkedDatabase(Database database, String s) throws IOException {
+ throw new AssertionError("DO NOT ALLOW RESOLVING OF LINKS!!!");
+ }
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Linked cell. This class decorates another content cell with a hyperlink.
+ */
+public class LinkedCell extends CellDecorator {
+
+ private final String link;
+
+ public LinkedCell(Cell cell, String link) {
+ super(cell);
+ assert link != null;
+ this.link = link;
+ }
+
+ public void render(XHTMLContentHandler handler) throws SAXException {
+ handler.startElement("a", "href", link);
+ super.render(handler);
+ handler.endElement("a");
+ }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ListManager.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ListManager.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ListManager.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ListManager.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.util.NoSuchElementException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.model.ListData;
+import org.apache.poi.hwpf.model.ListFormatOverrideLevel;
+import org.apache.poi.hwpf.model.ListLevel;
+import org.apache.poi.hwpf.model.ListTables;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+
+/**
+ * Computes the number text which goes at the beginning of each list paragraph
+ * <p/>
+ * <p><em>Note:</em> This class only handles the raw number text and does not apply any further formatting as described in [MS-DOC], v20140721, 2.4.6.3, Part 3 to it.<p>
+ * <p><em>Note 2:</em> The {@code tplc}, a visual override for the appearance of list levels, as defined in [MS-DOC], v20140721, 2.9.328 is not taken care of in this class.</p>
+ * <p>Further, this class does not yet handle overrides</p>
+ */
+public class ListManager extends AbstractListManager {
+
+ private static final Log logger = LogFactory.getLog(ListManager.class);
+ private final ListTables listTables;
+
+ /**
+ * Ordinary constructor for a new list reader
+ *
+ * @param document Document to process
+ */
+ public ListManager(final HWPFDocument document) {
+ this.listTables = document.getListTables();
+ }
+
+ /**
+ * Get the formatted number for a given paragraph
+ * <p/>
+ * <p><em>Note:</em> This only works correctly if called subsequently for <em>all</em> paragraphs in a valid selection (main document, text field, ...) which are part of a list.</p>
+ *
+ * @param paragraph list paragraph to process
+ * @return String which represents the numbering of this list paragraph; never {@code null}, can be empty string, though,
+ * if something goes wrong in getList()
+ * @throws IllegalArgumentException If the given paragraph is {@code null} or is not part of a list
+ */
+ public String getFormattedNumber(final Paragraph paragraph) {
+ if (paragraph == null) throw new IllegalArgumentException("Given paragraph cannot be null.");
+ if (!paragraph.isInList()) throw new IllegalArgumentException("Can only process list paragraphs.");
+ //lsid is equivalent to docx's abnum
+ //ilfo is equivalent to docx's num
+ int currAbNumId = -1;
+ try{
+ currAbNumId = paragraph.getList().getLsid();
+ } catch (NoSuchElementException e) {
+ //somewhat frequent exception when initializing HWPFList
+ return "";
+ } catch (IllegalArgumentException e) {
+ return "";
+ } catch (NullPointerException e) {
+ return "";
+ }
+
+ int currNumId = paragraph.getIlfo();
+ ParagraphLevelCounter lc = listLevelMap.get(currAbNumId);
+ LevelTuple[] overrideTuples = overrideTupleMap.get(currNumId);
+
+ if (lc == null) {
+ ListData listData = listTables.getListData(paragraph.getList().getLsid());
+ LevelTuple[] levelTuples = new LevelTuple[listData.getLevels().length];
+ for (int i = 0; i < listData.getLevels().length; i++) {
+ levelTuples[i] = buildTuple(i, listData.getLevels()[i]);
+ }
+ lc = new ParagraphLevelCounter(levelTuples);
+ }
+ if (overrideTuples == null) {
+ overrideTuples = buildOverrideTuples(paragraph, lc.getNumberOfLevels());
+ }
+ String formattedString = lc.incrementLevel(paragraph.getIlvl(), overrideTuples);
+
+ listLevelMap.put(currAbNumId, lc);
+ overrideTupleMap.put(currNumId, overrideTuples);
+ return formattedString;
+ }
+
+ private LevelTuple buildTuple(int i, ListLevel listLevel) {
+ boolean isLegal = false;
+ int start = 1;
+ int restart = -1;
+ String lvlText = "%" + i + ".";
+ String numFmt = "decimal";
+
+ start = listLevel.getStartAt();
+ restart = listLevel.getRestart();
+ isLegal = listLevel.isLegalNumbering();
+ numFmt = convertToNewNumFormat(listLevel.getNumberFormat());
+ lvlText = convertToNewNumberText(listLevel.getNumberText(), listLevel.getLevelNumberingPlaceholderOffsets());
+ return new LevelTuple(start, restart, lvlText, numFmt, isLegal);
+ }
+
+ private LevelTuple[] buildOverrideTuples(Paragraph par, int length) {
+ ListFormatOverrideLevel overrideLevel;
+ // find the override for this level
+ if (listTables.getLfoData(par.getIlfo()).getRgLfoLvl().length == 0) {
+ return null;
+ }
+ overrideLevel = listTables.getLfoData(par.getIlfo()).getRgLfoLvl()[0];
+ if (overrideLevel == null) {
+ return null;
+ }
+ LevelTuple[] levelTuples = new LevelTuple[length];
+ ListLevel listLevel = overrideLevel.getLevel();
+ if (listLevel == null) {
+ return null;
+ }
+ for (int i = 0; i < length; i++) {
+ levelTuples[i] = buildTuple(i, listLevel);
+ }
+
+ return levelTuples;
+
+ }
+
+ private String convertToNewNumberText(String numberText, byte[] numberOffsets) {
+
+ StringBuilder sb = new StringBuilder();
+ int last = 0;
+ for (int i = 0; i < numberOffsets.length; i++) {
+ int offset = (int) numberOffsets[i];
+
+ if (offset == 0) {
+ break;
+ }
+ sb.append(numberText.substring(last, offset - 1));
+ //need to add one because newer format
+ //adds one. In .doc, this was the array index;
+ //but in .docx, this is the level number
+ int lvlNum = (int) numberText.charAt(offset - 1) + 1;
+ sb.append("%" + lvlNum);
+ last = offset;
+ }
+ if (last < numberText.length()) {
+ sb.append(numberText.substring(last));
+ }
+ return sb.toString();
+ }
+
+ private String convertToNewNumFormat(int numberFormat) {
+ switch (numberFormat) {
+ case -1:
+ return "none";
+ case 0:
+ return "decimal";
+ case 1:
+ return "upperRoman";
+ case 2:
+ return "lowerRoman";
+ case 3:
+ return "upperLetter";
+ case 4:
+ return "lowerLetter";
+ case 5:
+ return "ordinal";
+ case 22:
+ return "decimalZero";
+ case 23:
+ return "bullet";
+ case 47:
+ return "none";
+ default:
+ //do we really want to silently swallow these uncovered cases?
+ //throw new RuntimeException("NOT COVERED: " + numberFormat);
+ return "decimal";
+ }
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.text.NumberFormat;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Number cell.
+ */
+public class NumberCell implements Cell {
+
+ private final double number;
+
+ private final NumberFormat format;
+
+ public NumberCell(double number, NumberFormat format) {
+ this.number = number;
+ this.format = format;
+ }
+
+ public void render(XHTMLContentHandler handler) throws SAXException {
+ handler.characters(format.format(number));
+ }
+
+ public String toString() {
+ return "Numeric Cell: " + format.format(number);
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,273 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.security.GeneralSecurityException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.poi.hdgf.extractor.VisioTextExtractor;
+import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
+import org.apache.poi.poifs.crypt.Decryptor;
+import org.apache.poi.poifs.crypt.EncryptionInfo;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Defines a Microsoft document content extractor.
+ */
+public class OfficeParser extends AbstractParser {
+
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = 7393462244028653479L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ POIFSDocumentType.WORKBOOK.type,
+ POIFSDocumentType.OLE10_NATIVE.type,
+ POIFSDocumentType.WORDDOCUMENT.type,
+ POIFSDocumentType.UNKNOWN.type,
+ POIFSDocumentType.ENCRYPTED.type,
+ POIFSDocumentType.POWERPOINT.type,
+ POIFSDocumentType.PUBLISHER.type,
+ POIFSDocumentType.PROJECT.type,
+ POIFSDocumentType.VISIO.type,
+ // Works isn't supported
+ POIFSDocumentType.XLR.type, // but Works 7.0 Spreadsheet is
+ POIFSDocumentType.OUTLOOK.type,
+ POIFSDocumentType.SOLIDWORKS_PART.type,
+ POIFSDocumentType.SOLIDWORKS_ASSEMBLY.type,
+ POIFSDocumentType.SOLIDWORKS_DRAWING.type
+ )));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ /**
+ * Extracts properties and text from an MS Document input stream
+ */
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+
+ final DirectoryNode root;
+ TikaInputStream tstream = TikaInputStream.cast(stream);
+ if (tstream == null) {
+ root = new NPOIFSFileSystem(new CloseShieldInputStream(stream)).getRoot();
+ } else {
+ final Object container = tstream.getOpenContainer();
+ if (container instanceof NPOIFSFileSystem) {
+ root = ((NPOIFSFileSystem) container).getRoot();
+ } else if (container instanceof DirectoryNode) {
+ root = (DirectoryNode) container;
+ } else {
+ NPOIFSFileSystem fs;
+ if (tstream.hasFile()) {
+ fs = new NPOIFSFileSystem(tstream.getFile(), true);
+ } else {
+ fs = new NPOIFSFileSystem(new CloseShieldInputStream(tstream));
+ }
+ tstream.setOpenContainer(fs);
+ root = fs.getRoot();
+ }
+ }
+ parse(root, context, metadata, xhtml);
+ xhtml.endDocument();
+ }
+
+ protected void parse(
+ DirectoryNode root, ParseContext context, Metadata metadata, XHTMLContentHandler xhtml)
+ throws IOException, SAXException, TikaException {
+
+ // Parse summary entries first, to make metadata available early
+ new SummaryExtractor(metadata).parseSummaries(root);
+
+ // Parse remaining document entries
+ POIFSDocumentType type = POIFSDocumentType.detectType(root);
+
+ if (type != POIFSDocumentType.UNKNOWN) {
+ setType(metadata, type.getType());
+ }
+
+ switch (type) {
+ case SOLIDWORKS_PART:
+ case SOLIDWORKS_ASSEMBLY:
+ case SOLIDWORKS_DRAWING:
+ break;
+ case PUBLISHER:
+ PublisherTextExtractor publisherTextExtractor =
+ new PublisherTextExtractor(root);
+ xhtml.element("p", publisherTextExtractor.getText());
+ break;
+ case WORDDOCUMENT:
+ new WordExtractor(context).parse(root, xhtml);
+ break;
+ case POWERPOINT:
+ new HSLFExtractor(context).parse(root, xhtml);
+ break;
+ case WORKBOOK:
+ case XLR:
+ Locale locale = context.get(Locale.class, Locale.getDefault());
+ new ExcelExtractor(context, metadata).parse(root, xhtml, locale);
+ break;
+ case PROJECT:
+ // We currently can't do anything beyond the metadata
+ break;
+ case VISIO:
+ VisioTextExtractor visioTextExtractor =
+ new VisioTextExtractor(root);
+ for (String text : visioTextExtractor.getAllText()) {
+ xhtml.element("p", text);
+ }
+ break;
+ case OUTLOOK:
+ OutlookExtractor extractor =
+ new OutlookExtractor(root, context);
+
+ extractor.parse(xhtml, metadata);
+ break;
+ case ENCRYPTED:
+ EncryptionInfo info = new EncryptionInfo(root);
+ Decryptor d = Decryptor.getInstance(info);
+
+ try {
+ // By default, use the default Office Password
+ String password = Decryptor.DEFAULT_PASSWORD;
+
+ // If they supplied a Password Provider, ask that for the password,
+ // and use the provider given one if available (stick with default if not)
+ PasswordProvider passwordProvider = context.get(PasswordProvider.class);
+ if (passwordProvider != null) {
+ String suppliedPassword = passwordProvider.getPassword(metadata);
+ if (suppliedPassword != null) {
+ password = suppliedPassword;
+ }
+ }
+
+ // Check if we've the right password or not
+ if (!d.verifyPassword(password)) {
+ throw new EncryptedDocumentException();
+ }
+
+ // Decrypt the OLE2 stream, and delegate the resulting OOXML
+ // file to the regular OOXML parser for normal handling
+ OOXMLParser parser = new OOXMLParser();
+
+ parser.parse(d.getDataStream(root), new EmbeddedContentHandler(
+ new BodyContentHandler(xhtml)),
+ metadata, context);
+ } catch (GeneralSecurityException ex) {
+ throw new EncryptedDocumentException(ex);
+ }
+ default:
+ // For unsupported / unhandled types, just the metadata
+ // is extracted, which happened above
+ break;
+ }
+ }
+
+ private void setType(Metadata metadata, MediaType type) {
+ metadata.set(Metadata.CONTENT_TYPE, type.toString());
+ }
+
+ public enum POIFSDocumentType {
+ WORKBOOK("xls", MediaType.application("vnd.ms-excel")),
+ OLE10_NATIVE("ole", POIFSContainerDetector.OLE10_NATIVE),
+ COMP_OBJ("ole", POIFSContainerDetector.COMP_OBJ),
+ WORDDOCUMENT("doc", MediaType.application("msword")),
+ UNKNOWN("unknown", MediaType.application("x-tika-msoffice")),
+ ENCRYPTED("ole", MediaType.application("x-tika-ooxml-protected")),
+ POWERPOINT("ppt", MediaType.application("vnd.ms-powerpoint")),
+ PUBLISHER("pub", MediaType.application("x-mspublisher")),
+ PROJECT("mpp", MediaType.application("vnd.ms-project")),
+ VISIO("vsd", MediaType.application("vnd.visio")),
+ WORKS("wps", MediaType.application("vnd.ms-works")),
+ XLR("xlr", MediaType.application("x-tika-msworks-spreadsheet")),
+ OUTLOOK("msg", MediaType.application("vnd.ms-outlook")),
+ SOLIDWORKS_PART("sldprt", MediaType.application("sldworks")),
+ SOLIDWORKS_ASSEMBLY("sldasm", MediaType.application("sldworks")),
+ SOLIDWORKS_DRAWING("slddrw", MediaType.application("sldworks"));
+
+ private final String extension;
+ private final MediaType type;
+
+ POIFSDocumentType(String extension, MediaType type) {
+ this.extension = extension;
+ this.type = type;
+ }
+
+ public static POIFSDocumentType detectType(POIFSFileSystem fs) {
+ return detectType(fs.getRoot());
+ }
+
+ public static POIFSDocumentType detectType(NPOIFSFileSystem fs) {
+ return detectType(fs.getRoot());
+ }
+
+ public static POIFSDocumentType detectType(DirectoryEntry node) {
+ Set<String> names = new HashSet<String>();
+ for (Entry entry : node) {
+ names.add(entry.getName());
+ }
+ MediaType type = POIFSContainerDetector.detect(names, node);
+ for (POIFSDocumentType poifsType : values()) {
+ if (type.equals(poifsType.type)) {
+ return poifsType;
+ }
+ }
+ return UNKNOWN;
+ }
+
+ public String getExtension() {
+ return extension;
+ }
+
+ public MediaType getType() {
+ return type;
+ }
+ }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringReader;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.poi.hssf.extractor.OldExcelExtractor;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * A POI-powered Tika Parser for very old versions of Excel, from
+ * pre-OLE2 days, such as Excel 4.
+ */
+public class OldExcelParser extends AbstractParser {
+ private static final long serialVersionUID = 4611820730372823452L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.application("vnd.ms-excel.sheet.4"),
+ MediaType.application("vnd.ms-excel.workspace.4"),
+ MediaType.application("vnd.ms-excel.sheet.3"),
+ MediaType.application("vnd.ms-excel.workspace.3"),
+ MediaType.application("vnd.ms-excel.sheet.2")
+ )));
+
+ protected static void parse(OldExcelExtractor extractor,
+ XHTMLContentHandler xhtml) throws TikaException, IOException, SAXException {
+ // Get the whole text, as a single string
+ String text = extractor.getText();
+
+ // Split and output
+ xhtml.startDocument();
+
+ String line;
+ BufferedReader reader = new BufferedReader(new StringReader(text));
+ while ((line = reader.readLine()) != null) {
+ xhtml.startElement("p");
+ xhtml.characters(line);
+ xhtml.endElement("p");
+ }
+
+ xhtml.endDocument();
+ }
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ /**
+ * Extracts properties and text from an MS Document input stream
+ */
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ // Open the POI provided extractor
+ OldExcelExtractor extractor = new OldExcelExtractor(stream);
+
+ // We can't do anything about metadata, as these old formats
+ // didn't have any stored with them
+
+ // Set the content type
+ // TODO Get the version and type, to set as the Content Type
+
+ // Have the text extracted and given to our Content Handler
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ parse(extractor, xhtml);
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,386 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.UnsupportedCharsetException;
+import java.text.ParseException;
+import java.util.Date;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.poi.hmef.attribute.MAPIRtfAttribute;
+import org.apache.poi.hsmf.MAPIMessage;
+import org.apache.poi.hsmf.datatypes.AttachmentChunks;
+import org.apache.poi.hsmf.datatypes.ByteChunk;
+import org.apache.poi.hsmf.datatypes.Chunk;
+import org.apache.poi.hsmf.datatypes.Chunks;
+import org.apache.poi.hsmf.datatypes.MAPIProperty;
+import org.apache.poi.hsmf.datatypes.PropertyValue;
+import org.apache.poi.hsmf.datatypes.StringChunk;
+import org.apache.poi.hsmf.datatypes.Types;
+import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.util.CodePageUtil;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.html.HtmlEncodingDetector;
+import org.apache.tika.parser.html.HtmlParser;
+import org.apache.tika.parser.mbox.MboxParser;
+import org.apache.tika.parser.rtf.RTFParser;
+import org.apache.tika.parser.txt.CharsetDetector;
+import org.apache.tika.parser.txt.CharsetMatch;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Outlook Message Parser.
+ */
+public class OutlookExtractor extends AbstractPOIFSExtractor {
+ private static final Metadata EMPTY_METADATA = new Metadata();
+ HtmlEncodingDetector detector = new HtmlEncodingDetector();
+
+ private final MAPIMessage msg;
+
+ public OutlookExtractor(NPOIFSFileSystem filesystem, ParseContext context) throws TikaException {
+ this(filesystem.getRoot(), context);
+ }
+
+ public OutlookExtractor(DirectoryNode root, ParseContext context) throws TikaException {
+ super(context);
+
+ try {
+ this.msg = new MAPIMessage(root);
+ } catch (IOException e) {
+ throw new TikaException("Failed to parse Outlook message", e);
+ }
+ }
+
+ public void parse(XHTMLContentHandler xhtml, Metadata metadata)
+ throws TikaException, SAXException, IOException {
+ try {
+ msg.setReturnNullOnMissingChunk(true);
+
+ // If the message contains strings that aren't stored
+ // as Unicode, try to sort out an encoding for them
+ if (msg.has7BitEncodingStrings()) {
+ guess7BitEncoding(msg);
+ }
+
+ // Start with the metadata
+ String subject = msg.getSubject();
+ String from = msg.getDisplayFrom();
+
+ metadata.set(TikaCoreProperties.CREATOR, from);
+ metadata.set(Metadata.MESSAGE_FROM, from);
+ metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo());
+ metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC());
+ metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC());
+
+ metadata.set(TikaCoreProperties.TITLE, subject);
+ // TODO: Move to description in Tika 2.0
+ metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION,
+ msg.getConversationTopic());
+
+ try {
+ for (String recipientAddress : msg.getRecipientEmailAddressList()) {
+ if (recipientAddress != null)
+ metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
+ }
+ } catch (ChunkNotFoundException he) {
+ } // Will be fixed in POI 3.7 Final
+
+ // Date - try two ways to find it
+ // First try via the proper chunk
+ if (msg.getMessageDate() != null) {
+ metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
+ metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
+ } else {
+ try {
+ // Failing that try via the raw headers
+ String[] headers = msg.getHeaders();
+ if (headers != null && headers.length > 0) {
+ for (String header : headers) {
+ if (header.toLowerCase(Locale.ROOT).startsWith("date:")) {
+ String date = header.substring(header.indexOf(':') + 1).trim();
+
+ // See if we can parse it as a normal mail date
+ try {
+ Date d = MboxParser.parseDate(date);
+ metadata.set(TikaCoreProperties.CREATED, d);
+ metadata.set(TikaCoreProperties.MODIFIED, d);
+ } catch (ParseException e) {
+ // Store it as-is, and hope for the best...
+ metadata.set(TikaCoreProperties.CREATED, date);
+ metadata.set(TikaCoreProperties.MODIFIED, date);
+ }
+ break;
+ }
+ }
+ }
+ } catch (ChunkNotFoundException he) {
+ // We can't find the date, sorry...
+ }
+ }
+
+
+ xhtml.element("h1", subject);
+
+ // Output the from and to details in text, as you
+ // often want them in text form for searching
+ xhtml.startElement("dl");
+ if (from != null) {
+ header(xhtml, "From", from);
+ }
+ header(xhtml, "To", msg.getDisplayTo());
+ header(xhtml, "Cc", msg.getDisplayCC());
+ header(xhtml, "Bcc", msg.getDisplayBCC());
+ try {
+ header(xhtml, "Recipients", msg.getRecipientEmailAddress());
+ } catch (ChunkNotFoundException e) {
+ }
+ xhtml.endElement("dl");
+
+ // Get the message body. Preference order is: html, rtf, text
+ Chunk htmlChunk = null;
+ Chunk rtfChunk = null;
+ Chunk textChunk = null;
+ for (Chunk chunk : msg.getMainChunks().getChunks()) {
+ if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
+ htmlChunk = chunk;
+ }
+ if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
+ rtfChunk = chunk;
+ }
+ if (chunk.getChunkId() == MAPIProperty.BODY.id) {
+ textChunk = chunk;
+ }
+ }
+
+ boolean doneBody = false;
+ xhtml.startElement("div", "class", "message-body");
+ if (htmlChunk != null) {
+ byte[] data = null;
+ if (htmlChunk instanceof ByteChunk) {
+ data = ((ByteChunk) htmlChunk).getValue();
+ } else if (htmlChunk instanceof StringChunk) {
+ data = ((StringChunk) htmlChunk).getRawValue();
+ }
+ if (data != null) {
+ HtmlParser htmlParser = new HtmlParser();
+ htmlParser.parse(
+ new ByteArrayInputStream(data),
+ new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
+ new Metadata(), new ParseContext()
+ );
+ doneBody = true;
+ }
+ }
+ if (rtfChunk != null && !doneBody) {
+ ByteChunk chunk = (ByteChunk) rtfChunk;
+ MAPIRtfAttribute rtf = new MAPIRtfAttribute(
+ MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue()
+ );
+ RTFParser rtfParser = new RTFParser();
+ rtfParser.parse(
+ new ByteArrayInputStream(rtf.getData()),
+ new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
+ new Metadata(), new ParseContext());
+ doneBody = true;
+ }
+ if (textChunk != null && !doneBody) {
+ xhtml.element("p", ((StringChunk) textChunk).getValue());
+ }
+ xhtml.endElement("div");
+
+ // Process the attachments
+ for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
+ xhtml.startElement("div", "class", "attachment-entry");
+
+ String filename = null;
+ if (attachment.attachLongFileName != null) {
+ filename = attachment.attachLongFileName.getValue();
+ } else if (attachment.attachFileName != null) {
+ filename = attachment.attachFileName.getValue();
+ }
+ if (filename != null && filename.length() > 0) {
+ xhtml.element("h1", filename);
+ }
+
+ if (attachment.attachData != null) {
+ handleEmbeddedResource(
+ TikaInputStream.get(attachment.attachData.getValue()),
+ filename, null,
+ null, xhtml, true
+ );
+ }
+ if (attachment.attachmentDirectory != null) {
+ handleEmbeddedOfficeDoc(
+ attachment.attachmentDirectory.getDirectory(),
+ xhtml
+ );
+ }
+
+ xhtml.endElement("div");
+ }
+ } catch (ChunkNotFoundException e) {
+ throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
+ }
+ }
+
+ private void header(XHTMLContentHandler xhtml, String key, String value)
+ throws SAXException {
+ if (value != null && value.length() > 0) {
+ xhtml.element("dt", key);
+ xhtml.element("dd", value);
+ }
+ }
+
+ /**
+ * Tries to identify the correct encoding for 7-bit (non-unicode)
+ * strings in the file.
+ * <p>Many messages store their strings as unicode, which is
+ * nice and easy. Some use one-byte encodings for their
+ * strings, but don't always store the encoding anywhere
+ * helpful in the file.</p>
+ * <p>This method checks for codepage properties, and failing that
+ * looks at the headers for the message, and uses these to
+ * guess the correct encoding for your file.</p>
+ * <p>Bug #49441 has more on why this is needed</p>
+ * <p>This is taken verbatim from POI (TIKA-1238)
+ * as a temporary workaround to prevent unsupported encoding exceptions</p>
+ */
+ private void guess7BitEncoding(MAPIMessage msg) {
+ Chunks mainChunks = msg.getMainChunks();
+ //sanity check
+ if (mainChunks == null) {
+ return;
+ }
+
+ Map<MAPIProperty, List<PropertyValue>> props = mainChunks.getProperties();
+ if (props != null) {
+ // First choice is a codepage property
+ for (MAPIProperty prop : new MAPIProperty[]{
+ MAPIProperty.MESSAGE_CODEPAGE,
+ MAPIProperty.INTERNET_CPID
+ }) {
+ List<PropertyValue> val = props.get(prop);
+ if (val != null && val.size() > 0) {
+ int codepage = ((PropertyValue.LongPropertyValue) val.get(0)).getValue();
+ String encoding = null;
+ try {
+ encoding = CodePageUtil.codepageToEncoding(codepage, true);
+ } catch (UnsupportedEncodingException e) {
+ //swallow
+ }
+ if (tryToSet7BitEncoding(msg, encoding)) {
+ return;
+ }
+ }
+ }
+ }
+
+ // Second choice is a charset on a content type header
+ try {
+ String[] headers = msg.getHeaders();
+ if(headers != null && headers.length > 0) {
+ // Look for a content type with a charset
+ Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE);
+
+ for(String header : headers) {
+ if(header.startsWith("Content-Type")) {
+ Matcher m = p.matcher(header);
+ if(m.matches()) {
+ // Found it! Tell all the string chunks
+ String charset = m.group(1);
+ if (tryToSet7BitEncoding(msg, charset)) {
+ return;
+ }
+ }
+ }
+ }
+ }
+ } catch(ChunkNotFoundException e) {}
+
+ // Nothing suitable in the headers, try HTML
+ // TODO: do we need to replicate this in Tika? If we wind up
+ // parsing the html version of the email, this is duplicative??
+ // Or do we need to reset the header strings based on the html
+ // meta header if there is no other information?
+ try {
+ String html = msg.getHtmlBody();
+ if(html != null && html.length() > 0) {
+ Charset charset = null;
+ try {
+ charset = detector.detect(new ByteArrayInputStream(
+ html.getBytes(UTF_8)), EMPTY_METADATA);
+ } catch (IOException e) {
+ //swallow
+ }
+ if (charset != null && tryToSet7BitEncoding(msg, charset.name())) {
+ return;
+ }
+ }
+ } catch(ChunkNotFoundException e) {}
+
+ //absolute last resort, try charset detector
+ StringChunk text = mainChunks.textBodyChunk;
+ if (text != null) {
+ CharsetDetector detector = new CharsetDetector();
+ detector.setText(text.getRawValue());
+ CharsetMatch match = detector.detect();
+ if (match != null && match.getConfidence() > 35 &&
+ tryToSet7BitEncoding(msg, match.getName())) {
+ return;
+ }
+ }
+ }
+
+ private boolean tryToSet7BitEncoding(MAPIMessage msg, String charsetName) {
+ if (charsetName == null) {
+ return false;
+ }
+
+ if (charsetName.equalsIgnoreCase("utf-8")) {
+ return false;
+ }
+ try {
+ if (Charset.isSupported(charsetName)) {
+ msg.set7BitEncoding(charsetName);
+ return true;
+ }
+ } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
+ //swallow
+ }
+ return false;
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,436 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.apache.tika.mime.MediaType.application;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.DocumentNode;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
/**
 * A detector that works on a POIFS OLE2 document
 * to figure out exactly what the file is.
 * This should work for all OLE2 documents, whether
 * they are ones supported by POI or not.
 */
public class POIFSContainerDetector implements Detector {

    /**
     * The OLE base file format
     */
    public static final MediaType OLE = application("x-tika-msoffice");
    /**
     * The protected OOXML base file format
     */
    public static final MediaType OOXML_PROTECTED = application("x-tika-ooxml-protected");
    /**
     * General embedded document type within an OLE2 container
     */
    public static final MediaType GENERAL_EMBEDDED = application("x-tika-msoffice-embedded");
    /**
     * An OLE10 Native embedded document within another OLE2 document
     */
    public static final MediaType OLE10_NATIVE =
            new MediaType(GENERAL_EMBEDDED, "format", "ole10_native");
    /**
     * Some other kind of embedded document, in a CompObj container within another OLE2 document
     */
    public static final MediaType COMP_OBJ =
            new MediaType(GENERAL_EMBEDDED, "format", "comp_obj");
    /**
     * Microsoft Excel
     */
    public static final MediaType XLS = application("vnd.ms-excel");
    /**
     * Microsoft Word
     */
    public static final MediaType DOC = application("msword");
    /**
     * Microsoft PowerPoint
     */
    public static final MediaType PPT = application("vnd.ms-powerpoint");
    /**
     * Microsoft Publisher
     */
    public static final MediaType PUB = application("x-mspublisher");
    /**
     * Microsoft Visio
     */
    public static final MediaType VSD = application("vnd.visio");
    /**
     * Microsoft Works
     */
    public static final MediaType WPS = application("vnd.ms-works");
    /**
     * Microsoft Works Spreadsheet 7.0
     */
    public static final MediaType XLR = application("x-tika-msworks-spreadsheet");
    /**
     * Microsoft Outlook
     */
    public static final MediaType MSG = application("vnd.ms-outlook");
    /**
     * Microsoft Project
     */
    public static final MediaType MPP = application("vnd.ms-project");
    /**
     * StarOffice Calc
     */
    public static final MediaType SDC = application("vnd.stardivision.calc");
    /**
     * StarOffice Draw
     */
    public static final MediaType SDA = application("vnd.stardivision.draw");
    /**
     * StarOffice Impress
     */
    public static final MediaType SDD = application("vnd.stardivision.impress");
    /**
     * StarOffice Writer
     */
    public static final MediaType SDW = application("vnd.stardivision.writer");
    /**
     * SolidWorks CAD file
     */
    public static final MediaType SLDWORKS = application("sldworks");
    /**
     * Hangul Word Processor (Korean)
     */
    public static final MediaType HWP = application("x-hwp-v5");
    /**
     * Serial version UID
     */
    private static final long serialVersionUID = -3028021741663605293L;
    /**
     * An ASCII String "StarImpress"
     */
    private static final byte[] STAR_IMPRESS = new byte[]{
            0x53, 0x74, 0x61, 0x72, 0x49, 0x6d, 0x70, 0x72, 0x65, 0x73, 0x73
    };
    /**
     * An ASCII String "StarDraw"
     */
    private static final byte[] STAR_DRAW = new byte[]{
            0x53, 0x74, 0x61, 0x72, 0x44, 0x72, 0x61, 0x77
    };
    /**
     * An ASCII String "Quill96" for Works Files
     */
    private static final byte[] WORKS_QUILL96 = new byte[]{
            0x51, 0x75, 0x69, 0x6c, 0x6c, 0x39, 0x36
    };
    /**
     * Regexp for matching the MPP Project Data stream
     */
    private static final Pattern mppDataMatch = Pattern.compile("\\s\\s\\s\\d+");

    /**
     * Internal detection of the specific kind of OLE2 document, based on the
     * names of the top level streams within the file.
     *
     * @deprecated Use {@link #detect(Set, DirectoryEntry)} and pass the root
     * entry of the filesystem whose type is to be detected, as a
     * second argument.
     */
    protected static MediaType detect(Set<String> names) {
        return detect(names, null);
    }

    /**
     * Internal detection of the specific kind of OLE2 document, based on the
     * names of the top-level streams within the file. In some cases the
     * detection may need access to the root {@link DirectoryEntry} of that file
     * for best results. The entry can be given as a second, optional argument.
     *
     * NOTE: the order of the checks below matters - several formats share
     * stream names (e.g. Works spreadsheets also contain "Workbook"), so
     * do not reorder branches without checking the inline comments.
     *
     * @param names the set of top-level entry names of the POIFS filesystem,
     *              may be null
     * @param root  the root directory entry of the filesystem, or null if it
     *              is not available ("legacy mode")
     * @return the most specific media type that could be determined, or
     *         {@link #OLE} as the generic fallback
     */
    protected static MediaType detect(Set<String> names, DirectoryEntry root) {
        if (names != null) {
            if (names.contains("SwDocContentMgr") && names.contains("SwDocMgrTempStorage")) {
                return SLDWORKS;
            } else if (names.contains("StarCalcDocument")) {
                // Star Office Calc
                return SDC;
            } else if (names.contains("StarWriterDocument")) {
                return SDW;
            } else if (names.contains("StarDrawDocument3")) {
                if (root == null) {
                    /*
                     * This is either StarOfficeDraw or StarOfficeImpress, we have
                     * to consult the CompObj to distinguish them, if this method is
                     * called in "legacy mode", without the root, just return
                     * x-tika-msoffice. The one-argument method is only for backward
                     * compatibility, if someone calls old API he/she can get the
                     * old result.
                     */
                    return OLE;
                } else {
                    return processCompObjFormatType(root);
                }
            } else if (names.contains("\u0005HwpSummaryInformation")) {
                // Hangul Word Processor v5+ (previous aren't OLE2-based)
                return HWP;
            } else if (names.contains("WksSSWorkBook")) {
                // This check has to be before names.contains("Workbook")
                // Works 7.0 spreadsheet files contain both
                // we want to avoid classifying this as Excel
                return XLR;
            } else if (names.contains("Workbook") || names.contains("WORKBOOK")) {
                return XLS;
            } else if (names.contains("Book")) {
                // Excel 95 or older, we won't be able to parse this....
                return XLS;
            } else if (names.contains("EncryptedPackage") &&
                    names.contains("EncryptionInfo") &&
                    names.contains("\u0006DataSpaces")) {
                // This is a protected OOXML document, which is an OLE2 file
                // with an Encrypted Stream which holds the OOXML data
                // Without decrypting the stream, we can't tell what kind of
                // OOXML file we have. Return a general OOXML Protected type,
                // and hope the name based detection can guess the rest!
                return OOXML_PROTECTED;
            } else if (names.contains("EncryptedPackage")) {
                // EncryptedPackage without the other encryption streams:
                // can't say more than "some OLE2 container"
                return OLE;
            } else if (names.contains("WordDocument")) {
                return DOC;
            } else if (names.contains("Quill")) {
                return PUB;
            } else if (names.contains("PowerPoint Document")) {
                return PPT;
            } else if (names.contains("VisioDocument")) {
                return VSD;
            } else if (names.contains("\u0001Ole10Native")) {
                return OLE10_NATIVE;
            } else if (names.contains("MatOST")) {
                // this occurs on older Works Word Processor files (versions 3.0 and 4.0)
                return WPS;
            } else if (names.contains("CONTENTS") && names.contains("SPELLING")) {
                // Newer Works files
                return WPS;
            } else if (names.contains("Contents") && names.contains("\u0003ObjInfo")) {
                return COMP_OBJ;
            } else if (names.contains("CONTENTS") && names.contains("\u0001CompObj")) {
                // CompObj is a general kind of OLE2 embedding, but this may be an old Works file
                // If we have the Directory, check
                if (root != null) {
                    MediaType type = processCompObjFormatType(root);
                    if (type == WPS) {
                        return WPS;
                    } else {
                        // Assume it's a general CompObj embedded resource
                        return COMP_OBJ;
                    }
                } else {
                    // Assume it's a general CompObj embedded resource
                    return COMP_OBJ;
                }
            } else if (names.contains("CONTENTS")) {
                // CONTENTS without SPELLING nor CompObj normally means some sort
                // of embedded non-office file inside an OLE2 document
                // This is most commonly triggered on nested directories
                return OLE;
            } else if (names.contains("\u0001CompObj") &&
                    (names.contains("Props") || names.contains("Props9") || names.contains("Props12"))) {
                // Could be Project, look for common name patterns
                for (String name : names) {
                    if (mppDataMatch.matcher(name).matches()) {
                        return MPP;
                    }
                }
                // no MPP data stream matched: falls through to the generic
                // OLE return below
            } else if (names.contains("PerfectOffice_MAIN")) {
                if (names.contains("SlideShow")) {
                    return MediaType.application("x-corelpresentations"); // .shw
                } else if (names.contains("PerfectOffice_OBJECTS")) {
                    return MediaType.application("x-quattro-pro"); // .wb?
                }
            } else if (names.contains("NativeContent_MAIN")) {
                return MediaType.application("x-quattro-pro"); // .qpw
            } else {
                // Outlook .msg files have per-property streams named
                // "__substg1.0_<tag>" rather than one fixed stream name
                for (String name : names) {
                    if (name.startsWith("__substg1.0_")) {
                        return MSG;
                    }
                }
            }
        }

        // Couldn't detect a more specific type
        return OLE;
    }

    /**
     * Is this one of the kinds of formats which uses CompObj to
     * store all of their data, eg Star Draw, Star Impress or
     * (older) Works?
     * If not, it's likely an embedded resource
     */
    private static MediaType processCompObjFormatType(DirectoryEntry root) {
        try {
            Entry e = root.getEntry("\u0001CompObj");
            if (e != null && e.isDocumentEntry()) {
                DocumentNode dn = (DocumentNode) e;
                // NOTE(review): this stream is not explicitly closed;
                // DocumentInputStream reads from in-memory POIFS data, but
                // confirm no resource is pinned here
                DocumentInputStream stream = new DocumentInputStream(dn);
                byte[] bytes = IOUtils.toByteArray(stream);
                /*
                 * This array contains a string with a normal ASCII name of the
                 * application used to create this file. We want to search for that
                 * name.
                 */
                if (arrayContains(bytes, STAR_DRAW)) {
                    return SDA;
                } else if (arrayContains(bytes, STAR_IMPRESS)) {
                    return SDD;
                } else if (arrayContains(bytes, WORKS_QUILL96)) {
                    return WPS;
                }
            }
        } catch (Exception e) {
            /*
             * "root.getEntry" can throw FileNotFoundException. The code inside
             * "if" can throw IOExceptions. Theoretically. Practically no
             * exceptions will likely ever appear.
             *
             * Swallow all of them. If any occur, we just assume that we can't
             * distinguish between Draw and Impress and return something safe:
             * x-tika-msoffice
             */
        }
        return OLE;
    }

    // poor man's search for byte arrays, replace with some library call if
    // you know one without adding new dependencies
    // NOTE: assumes "smaller" is non-empty (all callers pass fixed
    // non-empty signatures); an empty needle would throw AIOOBE
    private static boolean arrayContains(byte[] larger, byte[] smaller) {
        int largerCounter = 0;
        int smallerCounter = 0;
        while (largerCounter < larger.length) {
            if (larger[largerCounter] == smaller[smallerCounter]) {
                largerCounter++;
                smallerCounter++;
                if (smallerCounter == smaller.length) {
                    return true;
                }
            } else {
                // mismatch: restart one position past where this attempt began
                largerCounter = largerCounter - smallerCounter + 1;
                smallerCounter = 0;
            }
        }
        return false;
    }

    /**
     * Opens the stream as a POIFS filesystem (spooling it to a file first)
     * and returns its top-level entry names. On any POI failure an empty
     * set is returned, meaning "type unknown".
     */
    private static Set<String> getTopLevelNames(TikaInputStream stream)
            throws IOException {
        // Force the document stream to a (possibly temporary) file
        // so we don't modify the current position of the stream
        File file = stream.getFile();

        try {
            NPOIFSFileSystem fs = new NPOIFSFileSystem(file, true);

            // Optimize a possible later parsing process by keeping
            // a reference to the already opened POI file system
            stream.setOpenContainer(fs);

            return getTopLevelNames(fs.getRoot());
        } catch (IOException e) {
            // Parse error in POI, so we don't know the file type
            return Collections.emptySet();
        } catch (RuntimeException e) {
            // Another problem in POI
            return Collections.emptySet();
        }
    }

    /**
     * Collects the names of the direct children of the given directory node.
     */
    private static Set<String> getTopLevelNames(DirectoryNode root) {
        Set<String> names = new HashSet<String>();
        for (Entry entry : root) {
            names.add(entry.getName());
        }
        return names;
    }

    /**
     * Detects the exact OLE2-based media type of the stream. When given a
     * {@link TikaInputStream} with an already-opened POIFS container, the
     * names are read from it directly; otherwise the OLE2 magic bytes are
     * checked and, for TikaInputStreams, the container is opened to read the
     * top-level names. Plain InputStreams can only be identified as generic
     * {@link #OLE} or octet-stream.
     */
    public MediaType detect(InputStream input, Metadata metadata)
            throws IOException {
        // Check if we have access to the document
        if (input == null) {
            return MediaType.OCTET_STREAM;
        }

        // If this is a TikaInputStream wrapping an already
        // parsed NPOIFileSystem/DirectoryNode, just get the
        // names from the root:
        TikaInputStream tis = TikaInputStream.cast(input);
        Set<String> names = null;
        if (tis != null) {
            Object container = tis.getOpenContainer();
            if (container instanceof NPOIFSFileSystem) {
                names = getTopLevelNames(((NPOIFSFileSystem) container).getRoot());
            } else if (container instanceof DirectoryNode) {
                names = getTopLevelNames((DirectoryNode) container);
            }
        }

        if (names == null) {
            // Check if the document starts with the OLE header
            // (D0 CF 11 E0 A1 B1 1A E1)
            input.mark(8);
            try {
                if (input.read() != 0xd0 || input.read() != 0xcf
                        || input.read() != 0x11 || input.read() != 0xe0
                        || input.read() != 0xa1 || input.read() != 0xb1
                        || input.read() != 0x1a || input.read() != 0xe1) {
                    return MediaType.OCTET_STREAM;
                }
            } finally {
                input.reset();
            }
        }

        // We can only detect the exact type when given a TikaInputStream
        if (names == null && tis != null) {
            // Look for known top level entry names to detect the document type
            names = getTopLevelNames(tis);
        }

        // Detect based on the names (as available)
        if (tis != null &&
                tis.getOpenContainer() != null &&
                tis.getOpenContainer() instanceof NPOIFSFileSystem) {
            return detect(names, ((NPOIFSFileSystem) tis.getOpenContainer()).getRoot());
        } else {
            return detect(names, null);
        }
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,260 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Date;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.poi.hpsf.CustomProperties;
+import org.apache.poi.hpsf.DocumentSummaryInformation;
+import org.apache.poi.hpsf.MarkUnsupportedException;
+import org.apache.poi.hpsf.NoPropertySetStreamException;
+import org.apache.poi.hpsf.PropertySet;
+import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.hpsf.UnexpectedPropertySetTypeException;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.MSOffice;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.PagedText;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+
+/**
+ * Extractor for common OLE2 (HPSF) metadata.
+ * <p>
+ * Reads the {@code \005SummaryInformation} and
+ * {@code \005DocumentSummaryInformation} streams of an OLE2 container and
+ * copies their properties into a Tika {@link Metadata} object. Each value is
+ * written under both the new-style (Office/OOXML/TikaCoreProperties) keys and
+ * the legacy Tika 1.0 keys, which are scheduled for removal in Tika 2.0.
+ */
+public class SummaryExtractor {
+    // Log under this class (was AbstractPOIFSExtractor.class — a copy/paste
+    // slip that mis-attributed warnings emitted from parseSummaryEntryIfExists).
+    private static final Log logger = LogFactory.getLog(SummaryExtractor.class);
+
+    /** Name of the OLE2 stream that holds the SummaryInformation property set. */
+    private static final String SUMMARY_INFORMATION =
+            SummaryInformation.DEFAULT_STREAM_NAME;
+
+    /** Name of the OLE2 stream that holds the DocumentSummaryInformation property set. */
+    private static final String DOCUMENT_SUMMARY_INFORMATION =
+            DocumentSummaryInformation.DEFAULT_STREAM_NAME;
+
+    /** Target for every extracted property; supplied by the caller. */
+    private final Metadata metadata;
+
+    /**
+     * @param metadata metadata collection that extracted properties are added to
+     */
+    public SummaryExtractor(Metadata metadata) {
+        this.metadata = metadata;
+    }
+
+    /**
+     * Parses both HPSF summary streams from the root of the given filesystem.
+     *
+     * @param filesystem open OLE2 filesystem to read from
+     * @throws IOException on stream read failure
+     * @throws TikaException on malformed HPSF property sets
+     */
+    public void parseSummaries(NPOIFSFileSystem filesystem)
+            throws IOException, TikaException {
+        parseSummaries(filesystem.getRoot());
+    }
+
+    /**
+     * Parses both HPSF summary streams found directly under the given node.
+     *
+     * @param root directory node to look up the summary entries in
+     * @throws IOException on stream read failure
+     * @throws TikaException on malformed HPSF property sets
+     */
+    public void parseSummaries(DirectoryNode root)
+            throws IOException, TikaException {
+        parseSummaryEntryIfExists(root, SUMMARY_INFORMATION);
+        parseSummaryEntryIfExists(root, DOCUMENT_SUMMARY_INFORMATION);
+    }
+
+    /**
+     * Reads one summary entry if present, dispatching to the matching
+     * {@code parse(...)} overload. A missing entry or a non-property stream is
+     * silently skipped; any other unexpected failure is logged and swallowed so
+     * that broken metadata never aborts text extraction.
+     */
+    private void parseSummaryEntryIfExists(
+            DirectoryNode root, String entryName)
+            throws IOException, TikaException {
+        try {
+            DocumentEntry entry =
+                    (DocumentEntry) root.getEntry(entryName);
+            PropertySet properties =
+                    new PropertySet(new DocumentInputStream(entry));
+            if (properties.isSummaryInformation()) {
+                parse(new SummaryInformation(properties));
+            }
+            if (properties.isDocumentSummaryInformation()) {
+                parse(new DocumentSummaryInformation(properties));
+            }
+        } catch (FileNotFoundException e) {
+            // entry does not exist, just skip it
+        } catch (NoPropertySetStreamException e) {
+            // no property stream, just skip it
+        } catch (UnexpectedPropertySetTypeException e) {
+            throw new TikaException("Unexpected HPSF document", e);
+        } catch (MarkUnsupportedException e) {
+            throw new TikaException("Invalid DocumentInputStream", e);
+        } catch (Exception e) {
+            // Deliberate best-effort: corrupt summary data is common in the
+            // wild and must not fail the overall parse.
+            logger.warn("Ignoring unexpected exception while parsing summary entry " + entryName, e);
+        }
+    }
+
+    /** Copies SummaryInformation properties (title, author, dates, counts, ...). */
+    private void parse(SummaryInformation summary) {
+        set(TikaCoreProperties.TITLE, summary.getTitle());
+        // Author may be a semicolon-delimited list; split into multiple values
+        addMulti(metadata, TikaCoreProperties.CREATOR, summary.getAuthor());
+        set(TikaCoreProperties.KEYWORDS, summary.getKeywords());
+        // TODO Move to OO subject in Tika 2.0
+        set(TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, summary.getSubject());
+        set(TikaCoreProperties.MODIFIER, summary.getLastAuthor());
+        set(TikaCoreProperties.COMMENTS, summary.getComments());
+        set(OfficeOpenXMLExtended.TEMPLATE, summary.getTemplate());
+        set(OfficeOpenXMLExtended.APPLICATION, summary.getApplicationName());
+        set(OfficeOpenXMLCore.REVISION, summary.getRevNumber());
+        set(TikaCoreProperties.CREATED, summary.getCreateDateTime());
+        set(TikaCoreProperties.MODIFIED, summary.getLastSaveDateTime());
+        set(TikaCoreProperties.PRINT_DATE, summary.getLastPrinted());
+        set(Metadata.EDIT_TIME, summary.getEditTime());
+        set(OfficeOpenXMLExtended.DOC_SECURITY, summary.getSecurity());
+
+        // New style counts
+        set(Office.WORD_COUNT, summary.getWordCount());
+        set(Office.CHARACTER_COUNT, summary.getCharCount());
+        set(Office.PAGE_COUNT, summary.getPageCount());
+        if (summary.getPageCount() > 0) {
+            metadata.set(PagedText.N_PAGES, summary.getPageCount());
+        }
+
+        // Old style, Tika 1.0 properties
+        // TODO Remove these in Tika 2.0
+        set(Metadata.TEMPLATE, summary.getTemplate());
+        set(Metadata.APPLICATION_NAME, summary.getApplicationName());
+        set(Metadata.REVISION_NUMBER, summary.getRevNumber());
+        set(Metadata.SECURITY, summary.getSecurity());
+        set(MSOffice.WORD_COUNT, summary.getWordCount());
+        set(MSOffice.CHARACTER_COUNT, summary.getCharCount());
+        set(MSOffice.PAGE_COUNT, summary.getPageCount());
+    }
+
+    /** Copies DocumentSummaryInformation properties (company, language, custom props). */
+    private void parse(DocumentSummaryInformation summary) {
+        set(OfficeOpenXMLExtended.COMPANY, summary.getCompany());
+        // Manager may be a semicolon-delimited list; split into multiple values
+        addMulti(metadata, OfficeOpenXMLExtended.MANAGER, summary.getManager());
+        set(TikaCoreProperties.LANGUAGE, getLanguage(summary));
+        set(OfficeOpenXMLCore.CATEGORY, summary.getCategory());
+
+        // New style counts
+        set(Office.SLIDE_COUNT, summary.getSlideCount());
+        if (summary.getSlideCount() > 0) {
+            metadata.set(PagedText.N_PAGES, summary.getSlideCount());
+        }
+        // Old style, Tika 1.0 counts
+        // TODO Remove these in Tika 2.0
+        set(Metadata.COMPANY, summary.getCompany());
+        set(Metadata.MANAGER, summary.getManager());
+        set(MSOffice.SLIDE_COUNT, summary.getSlideCount());
+        set(Metadata.CATEGORY, summary.getCategory());
+
+        parse(summary.getCustomProperties());
+    }
+
+    /**
+     * Returns the document language if recorded as a custom "Language"
+     * property, or {@code null} when absent or not a String.
+     */
+    private String getLanguage(DocumentSummaryInformation summary) {
+        CustomProperties customProperties = summary.getCustomProperties();
+        if (customProperties != null) {
+            Object value = customProperties.get("Language");
+            if (value instanceof String) {
+                return (String) value;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Attempt to parse custom document properties and add to the collection of metadata.
+     * Each property key is prefixed with
+     * {@link Metadata#USER_DEFINED_METADATA_NAME_PREFIX}; values of unsupported
+     * types are silently dropped.
+     *
+     * @param customProperties custom HPSF properties, may be {@code null}
+     */
+    private void parse(CustomProperties customProperties) {
+        if (customProperties != null) {
+            for (String name : customProperties.nameSet()) {
+                // Apply the custom prefix
+                String key = Metadata.USER_DEFINED_METADATA_NAME_PREFIX + name;
+
+                // Get, convert and save property value
+                Object value = customProperties.get(name);
+                if (value instanceof String) {
+                    set(key, (String) value);
+                } else if (value instanceof Date) {
+                    Property prop = Property.externalDate(key);
+                    metadata.set(prop, (Date) value);
+                } else if (value instanceof Boolean) {
+                    Property prop = Property.externalBoolean(key);
+                    metadata.set(prop, value.toString());
+                } else if (value instanceof Long) {
+                    Property prop = Property.externalInteger(key);
+                    metadata.set(prop, ((Long) value).intValue());
+                } else if (value instanceof Double) {
+                    Property prop = Property.externalReal(key);
+                    metadata.set(prop, (Double) value);
+                } else if (value instanceof Integer) {
+                    Property prop = Property.externalInteger(key);
+                    metadata.set(prop, ((Integer) value).intValue());
+                }
+            }
+        }
+    }
+
+    /** Sets a string keyed metadata value, ignoring {@code null}. */
+    private void set(String name, String value) {
+        if (value != null) {
+            metadata.set(name, value);
+        }
+    }
+
+    /** Sets a string property value, ignoring {@code null}. */
+    private void set(Property property, String value) {
+        if (value != null) {
+            metadata.set(property, value);
+        }
+    }
+
+    /** Sets a date property value, ignoring {@code null}. */
+    private void set(Property property, Date value) {
+        if (value != null) {
+            metadata.set(property, value);
+        }
+    }
+
+    /** Sets an int property value; zero/negative means "absent" and is skipped. */
+    private void set(Property property, int value) {
+        if (value > 0) {
+            metadata.set(property, value);
+        }
+    }
+
+    /** Sets a long value as a string; zero/negative means "absent" and is skipped. */
+    private void set(String name, long value) {
+        if (value > 0) {
+            metadata.set(name, Long.toString(value));
+        }
+    }
+
+    //MS stores values that should be multiple values (e.g. dc:creator)
+    //as a semicolon-delimited list. We need to split
+    //on semicolon to add each value.
+    public static void addMulti(Metadata metadata, Property property, String string) {
+        if (string == null) {
+            return;
+        }
+        String[] parts = string.split(";");
+        // Track existing values so repeated calls never add duplicates
+        String[] current = metadata.getValues(property);
+        Set<String> seen = new HashSet<>();
+        if (current != null) {
+            for (String val : current) {
+                seen.add(val);
+            }
+        }
+        for (String part : parts) {
+            if (! seen.contains(part)) {
+                metadata.add(property, part);
+                seen.add(part);
+            }
+        }
+    }
+
+}