Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/06 04:50:57 UTC

svn commit: r1723223 [10/32] - in /tika/branches/2.x: tika-core/src/test/resources/META-INF/ tika-core/src/test/resources/META-INF/services/ tika-parser-modules/ tika-parser-modules/tika-advanced-module/ tika-parser-modules/tika-advanced-module/src/ ti...

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringReader;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.poi.hssf.extractor.OldExcelExtractor;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * A POI-powered Tika Parser for very old versions of Excel, from
+ * pre-OLE2 days, such as Excel 4.
+ */
+public class OldExcelParser extends AbstractParser {
+    private static final long serialVersionUID = 4611820730372823452L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    MediaType.application("vnd.ms-excel.sheet.4"),
+                    MediaType.application("vnd.ms-excel.workspace.4"),
+                    MediaType.application("vnd.ms-excel.sheet.3"),
+                    MediaType.application("vnd.ms-excel.workspace.3"),
+                    MediaType.application("vnd.ms-excel.sheet.2")
+            )));
+
+    protected static void parse(OldExcelExtractor extractor,
+                                XHTMLContentHandler xhtml) throws TikaException, IOException, SAXException {
+        // Get the whole text, as a single string
+        String text = extractor.getText();
+
+        // Split and output
+        xhtml.startDocument();
+
+        String line;
+        BufferedReader reader = new BufferedReader(new StringReader(text));
+        while ((line = reader.readLine()) != null) {
+            xhtml.startElement("p");
+            xhtml.characters(line);
+            xhtml.endElement("p");
+        }
+
+        xhtml.endDocument();
+    }
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Extracts text from an old-format (pre-OLE2) Excel input stream
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // Open the POI provided extractor
+        OldExcelExtractor extractor = new OldExcelExtractor(stream);
+
+        // We can't do anything about metadata, as these old formats
+        //  didn't have any stored with them
+
+        // Set the content type
+        // TODO Get the version and type, to set as the Content Type
+
+        // Have the text extracted and given to our Content Handler
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        parse(extractor, xhtml);
+    }
+}

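For reference, a minimal usage sketch of the new OldExcelParser (not part of the commit): the file name below is a placeholder, and capturing plain text through a BodyContentHandler is just one way to consume the XHTML output produced by the parser.

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.OldExcelParser;
import org.apache.tika.sax.BodyContentHandler;

public class OldExcelParserDemo {
    public static void main(String[] args) throws Exception {
        OldExcelParser parser = new OldExcelParser();
        BodyContentHandler handler = new BodyContentHandler(-1);  // -1 disables the write limit
        Metadata metadata = new Metadata();
        try (InputStream stream = Files.newInputStream(Paths.get("legacy-excel4.xls"))) {
            // Each text line extracted by POI's OldExcelExtractor becomes a <p> element
            parser.parse(stream, handler, metadata, new ParseContext());
        }
        System.out.println(handler.toString());
    }
}
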
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,386 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.UnsupportedCharsetException;
+import java.text.ParseException;
+import java.util.Date;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.poi.hmef.attribute.MAPIRtfAttribute;
+import org.apache.poi.hsmf.MAPIMessage;
+import org.apache.poi.hsmf.datatypes.AttachmentChunks;
+import org.apache.poi.hsmf.datatypes.ByteChunk;
+import org.apache.poi.hsmf.datatypes.Chunk;
+import org.apache.poi.hsmf.datatypes.Chunks;
+import org.apache.poi.hsmf.datatypes.MAPIProperty;
+import org.apache.poi.hsmf.datatypes.PropertyValue;
+import org.apache.poi.hsmf.datatypes.StringChunk;
+import org.apache.poi.hsmf.datatypes.Types;
+import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.util.CodePageUtil;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.html.HtmlEncodingDetector;
+import org.apache.tika.parser.html.HtmlParser;
+import org.apache.tika.parser.mbox.MboxParser;
+import org.apache.tika.parser.rtf.RTFParser;
+import org.apache.tika.parser.txt.CharsetDetector;
+import org.apache.tika.parser.txt.CharsetMatch;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Outlook Message Parser.
+ */
+public class OutlookExtractor extends AbstractPOIFSExtractor {
+    private static final Metadata EMPTY_METADATA = new Metadata();
+    HtmlEncodingDetector detector = new HtmlEncodingDetector();
+
+    private final MAPIMessage msg;
+
+    public OutlookExtractor(NPOIFSFileSystem filesystem, ParseContext context) throws TikaException {
+        this(filesystem.getRoot(), context);
+    }
+
+    public OutlookExtractor(DirectoryNode root, ParseContext context) throws TikaException {
+        super(context);
+
+        try {
+            this.msg = new MAPIMessage(root);
+        } catch (IOException e) {
+            throw new TikaException("Failed to parse Outlook message", e);
+        }
+    }
+
+    public void parse(XHTMLContentHandler xhtml, Metadata metadata)
+            throws TikaException, SAXException, IOException {
+        try {
+            msg.setReturnNullOnMissingChunk(true);
+
+            // If the message contains strings that aren't stored
+            //  as Unicode, try to sort out an encoding for them
+            if (msg.has7BitEncodingStrings()) {
+                guess7BitEncoding(msg);
+            }
+
+            // Start with the metadata
+            String subject = msg.getSubject();
+            String from = msg.getDisplayFrom();
+
+            metadata.set(TikaCoreProperties.CREATOR, from);
+            metadata.set(Metadata.MESSAGE_FROM, from);
+            metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo());
+            metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC());
+            metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC());
+
+            metadata.set(TikaCoreProperties.TITLE, subject);
+            // TODO: Move to description in Tika 2.0
+            metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION,
+                    msg.getConversationTopic());
+
+            try {
+                for (String recipientAddress : msg.getRecipientEmailAddressList()) {
+                    if (recipientAddress != null)
+                        metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
+                }
+            } catch (ChunkNotFoundException he) {
+            } // Missing recipient list chunk - ignore (fixed in POI 3.7 Final)
+
+            // Date - try two ways to find it
+            // First try via the proper chunk
+            if (msg.getMessageDate() != null) {
+                metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
+                metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
+            } else {
+                try {
+                    // Failing that try via the raw headers
+                    String[] headers = msg.getHeaders();
+                    if (headers != null && headers.length > 0) {
+                        for (String header : headers) {
+                            if (header.toLowerCase(Locale.ROOT).startsWith("date:")) {
+                                String date = header.substring(header.indexOf(':') + 1).trim();
+
+                                // See if we can parse it as a normal mail date
+                                try {
+                                    Date d = MboxParser.parseDate(date);
+                                    metadata.set(TikaCoreProperties.CREATED, d);
+                                    metadata.set(TikaCoreProperties.MODIFIED, d);
+                                } catch (ParseException e) {
+                                    // Store it as-is, and hope for the best...
+                                    metadata.set(TikaCoreProperties.CREATED, date);
+                                    metadata.set(TikaCoreProperties.MODIFIED, date);
+                                }
+                                break;
+                            }
+                        }
+                    }
+                } catch (ChunkNotFoundException he) {
+                    // We can't find the date, sorry...
+                }
+            }
+
+
+            xhtml.element("h1", subject);
+
+            // Output the from and to details in text, as you
+            //  often want them in text form for searching
+            xhtml.startElement("dl");
+            if (from != null) {
+                header(xhtml, "From", from);
+            }
+            header(xhtml, "To", msg.getDisplayTo());
+            header(xhtml, "Cc", msg.getDisplayCC());
+            header(xhtml, "Bcc", msg.getDisplayBCC());
+            try {
+                header(xhtml, "Recipients", msg.getRecipientEmailAddress());
+            } catch (ChunkNotFoundException e) {
+            }
+            xhtml.endElement("dl");
+
+            // Get the message body. Preference order is: html, rtf, text
+            Chunk htmlChunk = null;
+            Chunk rtfChunk = null;
+            Chunk textChunk = null;
+            for (Chunk chunk : msg.getMainChunks().getChunks()) {
+                if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
+                    htmlChunk = chunk;
+                }
+                if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
+                    rtfChunk = chunk;
+                }
+                if (chunk.getChunkId() == MAPIProperty.BODY.id) {
+                    textChunk = chunk;
+                }
+            }
+
+            boolean doneBody = false;
+            xhtml.startElement("div", "class", "message-body");
+            if (htmlChunk != null) {
+                byte[] data = null;
+                if (htmlChunk instanceof ByteChunk) {
+                    data = ((ByteChunk) htmlChunk).getValue();
+                } else if (htmlChunk instanceof StringChunk) {
+                    data = ((StringChunk) htmlChunk).getRawValue();
+                }
+                if (data != null) {
+                    HtmlParser htmlParser = new HtmlParser();
+                    htmlParser.parse(
+                            new ByteArrayInputStream(data),
+                            new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
+                            new Metadata(), new ParseContext()
+                    );
+                    doneBody = true;
+                }
+            }
+            if (rtfChunk != null && !doneBody) {
+                ByteChunk chunk = (ByteChunk) rtfChunk;
+                MAPIRtfAttribute rtf = new MAPIRtfAttribute(
+                        MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue()
+                );
+                RTFParser rtfParser = new RTFParser();
+                rtfParser.parse(
+                        new ByteArrayInputStream(rtf.getData()),
+                        new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
+                        new Metadata(), new ParseContext());
+                doneBody = true;
+            }
+            if (textChunk != null && !doneBody) {
+                xhtml.element("p", ((StringChunk) textChunk).getValue());
+            }
+            xhtml.endElement("div");
+
+            // Process the attachments
+            for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
+                xhtml.startElement("div", "class", "attachment-entry");
+
+                String filename = null;
+                if (attachment.attachLongFileName != null) {
+                    filename = attachment.attachLongFileName.getValue();
+                } else if (attachment.attachFileName != null) {
+                    filename = attachment.attachFileName.getValue();
+                }
+                if (filename != null && filename.length() > 0) {
+                    xhtml.element("h1", filename);
+                }
+
+                if (attachment.attachData != null) {
+                    handleEmbeddedResource(
+                            TikaInputStream.get(attachment.attachData.getValue()),
+                            filename, null,
+                            null, xhtml, true
+                    );
+                }
+                if (attachment.attachmentDirectory != null) {
+                    handleEmbeddedOfficeDoc(
+                            attachment.attachmentDirectory.getDirectory(),
+                            xhtml
+                    );
+                }
+
+                xhtml.endElement("div");
+            }
+        } catch (ChunkNotFoundException e) {
+            throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
+        }
+    }
+
+    private void header(XHTMLContentHandler xhtml, String key, String value)
+            throws SAXException {
+        if (value != null && value.length() > 0) {
+            xhtml.element("dt", key);
+            xhtml.element("dd", value);
+        }
+    }
+
+    /**
+     * Tries to identify the correct encoding for 7-bit (non-unicode)
+     *  strings in the file.
+     * <p>Many messages store their strings as unicode, which is
+     *  nice and easy. Some use one-byte encodings for their
+     *  strings, but don't always store the encoding anywhere
+     *  helpful in the file.</p>
+     * <p>This method checks for codepage properties, and failing that
+     *  looks at the headers for the message, and uses these to
+     *  guess the correct encoding for your file.</p>
+     * <p>Bug #49441 has more on why this is needed</p>
+     * <p>This is taken verbatim from POI (TIKA-1238)
+     * as a temporary workaround to prevent unsupported encoding exceptions</p>
+     */
+    private void guess7BitEncoding(MAPIMessage msg) {
+        Chunks mainChunks = msg.getMainChunks();
+        //sanity check
+        if (mainChunks == null) {
+            return;
+        }
+
+        Map<MAPIProperty, List<PropertyValue>> props = mainChunks.getProperties();
+        if (props != null) {
+            // First choice is a codepage property
+            for (MAPIProperty prop : new MAPIProperty[]{
+                    MAPIProperty.MESSAGE_CODEPAGE,
+                    MAPIProperty.INTERNET_CPID
+            }) {
+                List<PropertyValue> val = props.get(prop);
+                if (val != null && val.size() > 0) {
+                    int codepage = ((PropertyValue.LongPropertyValue) val.get(0)).getValue();
+                    String encoding = null;
+                    try {
+                        encoding = CodePageUtil.codepageToEncoding(codepage, true);
+                    } catch (UnsupportedEncodingException e) {
+                        //swallow
+                    }
+                    if (tryToSet7BitEncoding(msg, encoding)) {
+                        return;
+                    }
+                }
+            }
+        }
+
+        // Second choice is a charset on a content type header
+        try {
+            String[] headers = msg.getHeaders();
+            if(headers != null && headers.length > 0) {
+                // Look for a content type with a charset
+                Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE);
+
+                for(String header : headers) {
+                    if(header.startsWith("Content-Type")) {
+                        Matcher m = p.matcher(header);
+                        if(m.matches()) {
+                            // Found it! Tell all the string chunks
+                            String charset = m.group(1);
+                            if (tryToSet7BitEncoding(msg, charset)) {
+                                return;
+                            }
+                        }
+                    }
+                }
+            }
+        } catch(ChunkNotFoundException e) {}
+
+        // Nothing suitable in the headers, try HTML
+        // TODO: do we need to replicate this in Tika? If we wind up
+        // parsing the html version of the email, this is duplicative??
+        // Or do we need to reset the header strings based on the html
+        // meta header if there is no other information?
+        try {
+            String html = msg.getHtmlBody();
+            if(html != null && html.length() > 0) {
+                Charset charset = null;
+                try {
+                    charset = detector.detect(new ByteArrayInputStream(
+                            html.getBytes(UTF_8)), EMPTY_METADATA);
+                } catch (IOException e) {
+                    //swallow
+                }
+                if (charset != null && tryToSet7BitEncoding(msg, charset.name())) {
+                    return;
+                }
+            }
+        } catch(ChunkNotFoundException e) {}
+
+        //absolute last resort, try charset detector
+        StringChunk text = mainChunks.textBodyChunk;
+        if (text != null) {
+            CharsetDetector detector = new CharsetDetector();
+            detector.setText(text.getRawValue());
+            CharsetMatch match = detector.detect();
+            if (match != null && match.getConfidence() > 35 &&
+                    tryToSet7BitEncoding(msg, match.getName())) {
+                return;
+            }
+        }
+    }
+
+    private boolean tryToSet7BitEncoding(MAPIMessage msg, String charsetName) {
+        if (charsetName == null) {
+            return false;
+        }
+
+        if (charsetName.equalsIgnoreCase("utf-8")) {
+            return false;
+        }
+        try {
+            if (Charset.isSupported(charsetName)) {
+                msg.set7BitEncoding(charsetName);
+                return true;
+            }
+        } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
+            //swallow
+        }
+        return false;
+    }
+}

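In normal operation this extractor is driven by OfficeParser, which supplies the surrounding XHTML document; as a rough sketch of its contract (illustrative only, file name and handler choice are placeholders), it could also be exercised directly along these lines:

import java.io.File;

import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.OutlookExtractor;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;

public class OutlookExtractorDemo {
    public static void main(String[] args) throws Exception {
        Metadata metadata = new Metadata();
        BodyContentHandler body = new BodyContentHandler(-1);
        XHTMLContentHandler xhtml = new XHTMLContentHandler(body, metadata);
        try (NPOIFSFileSystem fs = new NPOIFSFileSystem(new File("mail.msg"), true)) {
            OutlookExtractor extractor = new OutlookExtractor(fs, new ParseContext());
            xhtml.startDocument();              // OfficeParser normally does this wrapping
            extractor.parse(xhtml, metadata);   // fills metadata, emits <h1>/<dl>/body/attachments
            xhtml.endDocument();
        }
        System.out.println(metadata.get(Metadata.MESSAGE_FROM));
        System.out.println(body.toString());
    }
}
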
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,436 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.apache.tika.mime.MediaType.application;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.DocumentNode;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * A detector that works on a POIFS OLE2 document
+ * to figure out exactly what the file is.
+ * This should work for all OLE2 documents, whether
+ * they are ones supported by POI or not.
+ */
+public class POIFSContainerDetector implements Detector {
+
+    /**
+     * The OLE base file format
+     */
+    public static final MediaType OLE = application("x-tika-msoffice");
+    /**
+     * The protected OOXML base file format
+     */
+    public static final MediaType OOXML_PROTECTED = application("x-tika-ooxml-protected");
+    /**
+     * General embedded document type within an OLE2 container
+     */
+    public static final MediaType GENERAL_EMBEDDED = application("x-tika-msoffice-embedded");
+    /**
+     * An OLE10 Native embedded document within another OLE2 document
+     */
+    public static final MediaType OLE10_NATIVE =
+            new MediaType(GENERAL_EMBEDDED, "format", "ole10_native");
+    /**
+     * Some other kind of embedded document, in a CompObj container within another OLE2 document
+     */
+    public static final MediaType COMP_OBJ =
+            new MediaType(GENERAL_EMBEDDED, "format", "comp_obj");
+    /**
+     * Microsoft Excel
+     */
+    public static final MediaType XLS = application("vnd.ms-excel");
+    /**
+     * Microsoft Word
+     */
+    public static final MediaType DOC = application("msword");
+    /**
+     * Microsoft PowerPoint
+     */
+    public static final MediaType PPT = application("vnd.ms-powerpoint");
+    /**
+     * Microsoft Publisher
+     */
+    public static final MediaType PUB = application("x-mspublisher");
+    /**
+     * Microsoft Visio
+     */
+    public static final MediaType VSD = application("vnd.visio");
+    /**
+     * Microsoft Works
+     */
+    public static final MediaType WPS = application("vnd.ms-works");
+    /**
+     * Microsoft Works Spreadsheet 7.0
+     */
+    public static final MediaType XLR = application("x-tika-msworks-spreadsheet");
+    /**
+     * Microsoft Outlook
+     */
+    public static final MediaType MSG = application("vnd.ms-outlook");
+    /**
+     * Microsoft Project
+     */
+    public static final MediaType MPP = application("vnd.ms-project");
+    /**
+     * StarOffice Calc
+     */
+    public static final MediaType SDC = application("vnd.stardivision.calc");
+    /**
+     * StarOffice Draw
+     */
+    public static final MediaType SDA = application("vnd.stardivision.draw");
+    /**
+     * StarOffice Impress
+     */
+    public static final MediaType SDD = application("vnd.stardivision.impress");
+    /**
+     * StarOffice Writer
+     */
+    public static final MediaType SDW = application("vnd.stardivision.writer");
+    /**
+     * SolidWorks CAD file
+     */
+    public static final MediaType SLDWORKS = application("sldworks");
+    /**
+     * Hangul Word Processor (Korean)
+     */
+    public static final MediaType HWP = application("x-hwp-v5");
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -3028021741663605293L;
+    /**
+     * An ASCII String "StarImpress"
+     */
+    private static final byte[] STAR_IMPRESS = new byte[]{
+            0x53, 0x74, 0x61, 0x72, 0x49, 0x6d, 0x70, 0x72, 0x65, 0x73, 0x73
+    };
+    /**
+     * An ASCII String "StarDraw"
+     */
+    private static final byte[] STAR_DRAW = new byte[]{
+            0x53, 0x74, 0x61, 0x72, 0x44, 0x72, 0x61, 0x77
+    };
+    /**
+     * An ASCII String "Quill96" for Works Files
+     */
+    private static final byte[] WORKS_QUILL96 = new byte[]{
+            0x51, 0x75, 0x69, 0x6c, 0x6c, 0x39, 0x36
+    };
+    /**
+     * Regexp for matching the MPP Project Data stream
+     */
+    private static final Pattern mppDataMatch = Pattern.compile("\\s\\s\\s\\d+");
+
+    /**
+     * Internal detection of the specific kind of OLE2 document, based on the
+     * names of the top level streams within the file.
+     *
+     * @deprecated Use {@link #detect(Set, DirectoryEntry)} and pass the root
+     * entry of the filesystem whose type is to be detected, as a
+     * second argument.
+     */
+    protected static MediaType detect(Set<String> names) {
+        return detect(names, null);
+    }
+
+    /**
+     * Internal detection of the specific kind of OLE2 document, based on the
+     * names of the top-level streams within the file. In some cases the
+     * detection may need access to the root {@link DirectoryEntry} of that file
+     * for best results. The entry can be given as a second, optional argument.
+     *
+     * @param names
+     * @param root
+     * @return
+     */
+    protected static MediaType detect(Set<String> names, DirectoryEntry root) {
+        if (names != null) {
+            if (names.contains("SwDocContentMgr") && names.contains("SwDocMgrTempStorage")) {
+                return SLDWORKS;
+            } else if (names.contains("StarCalcDocument")) {
+                // Star Office Calc
+                return SDC;
+            } else if (names.contains("StarWriterDocument")) {
+                return SDW;
+            } else if (names.contains("StarDrawDocument3")) {
+                if (root == null) {
+                    /*
+                     * This is either StarOfficeDraw or StarOfficeImpress, we have
+                     * to consult the CompObj to distinguish them, if this method is
+                     * called in "legacy mode", without the root, just return
+                     * x-tika-msoffice. The one-argument method is only for backward
+                     * compatibility, if someone calls old API he/she can get the
+                     * old result.
+                     */
+                    return OLE;
+                } else {
+                    return processCompObjFormatType(root);
+                }
+            } else if (names.contains("\u0005HwpSummaryInformation")) {
+                // Hangul Word Processor v5+ (previous aren't OLE2-based)
+                return HWP;
+            } else if (names.contains("WksSSWorkBook")) {
+                // This check has to be before names.contains("Workbook")
+                // Works 7.0 spreadsheet files contain both
+                // we want to avoid classifying this as Excel
+                return XLR;
+            } else if (names.contains("Workbook") || names.contains("WORKBOOK")) {
+                return XLS;
+            } else if (names.contains("Book")) {
+                // Excel 95 or older, we won't be able to parse this....
+                return XLS;
+            } else if (names.contains("EncryptedPackage") &&
+                    names.contains("EncryptionInfo") &&
+                    names.contains("\u0006DataSpaces")) {
+                // This is a protected OOXML document, which is an OLE2 file
+                //  with an Encrypted Stream which holds the OOXML data
+                // Without decrypting the stream, we can't tell what kind of
+                //  OOXML file we have. Return a general OOXML Protected type,
+                //  and hope the name based detection can guess the rest!
+                return OOXML_PROTECTED;
+            } else if (names.contains("EncryptedPackage")) {
+                return OLE;
+            } else if (names.contains("WordDocument")) {
+                return DOC;
+            } else if (names.contains("Quill")) {
+                return PUB;
+            } else if (names.contains("PowerPoint Document")) {
+                return PPT;
+            } else if (names.contains("VisioDocument")) {
+                return VSD;
+            } else if (names.contains("\u0001Ole10Native")) {
+                return OLE10_NATIVE;
+            } else if (names.contains("MatOST")) {
+                // this occurs on older Works Word Processor files (versions 3.0 and 4.0)
+                return WPS;
+            } else if (names.contains("CONTENTS") && names.contains("SPELLING")) {
+                // Newer Works files
+                return WPS;
+            } else if (names.contains("Contents") && names.contains("\u0003ObjInfo")) {
+                return COMP_OBJ;
+            } else if (names.contains("CONTENTS") && names.contains("\u0001CompObj")) {
+                // CompObj is a general kind of OLE2 embedding, but this may be an old Works file
+                // If we have the Directory, check
+                if (root != null) {
+                    MediaType type = processCompObjFormatType(root);
+                    if (type == WPS) {
+                        return WPS;
+                    } else {
+                        // Assume it's a general CompObj embedded resource
+                        return COMP_OBJ;
+                    }
+                } else {
+                    // Assume it's a general CompObj embedded resource
+                    return COMP_OBJ;
+                }
+            } else if (names.contains("CONTENTS")) {
+                // CONTENTS without SPELLING nor CompObj normally means some sort
+                //  of embedded non-office file inside an OLE2 document
+                // This is most commonly triggered on nested directories
+                return OLE;
+            } else if (names.contains("\u0001CompObj") &&
+                    (names.contains("Props") || names.contains("Props9") || names.contains("Props12"))) {
+                // Could be Project, look for common name patterns
+                for (String name : names) {
+                    if (mppDataMatch.matcher(name).matches()) {
+                        return MPP;
+                    }
+                }
+            } else if (names.contains("PerfectOffice_MAIN")) {
+                if (names.contains("SlideShow")) {
+                    return MediaType.application("x-corelpresentations"); // .shw
+                } else if (names.contains("PerfectOffice_OBJECTS")) {
+                    return MediaType.application("x-quattro-pro"); // .wb?
+                }
+            } else if (names.contains("NativeContent_MAIN")) {
+                return MediaType.application("x-quattro-pro"); // .qpw
+            } else {
+                for (String name : names) {
+                    if (name.startsWith("__substg1.0_")) {
+                        return MSG;
+                    }
+                }
+            }
+        }
+
+        // Couldn't detect a more specific type
+        return OLE;
+    }
+
+    /**
+     * Is this one of the kinds of formats which use CompObj to
+     * store all of their data, e.g. Star Draw, Star Impress or
+     * (older) Works?
+     * If not, it's likely an embedded resource.
+     */
+    private static MediaType processCompObjFormatType(DirectoryEntry root) {
+        try {
+            Entry e = root.getEntry("\u0001CompObj");
+            if (e != null && e.isDocumentEntry()) {
+                DocumentNode dn = (DocumentNode) e;
+                DocumentInputStream stream = new DocumentInputStream(dn);
+                byte[] bytes = IOUtils.toByteArray(stream);
+                /*
+                 * This array contains a string with a normal ASCII name of the
+                 * application used to create this file. We want to search for that
+                 * name.
+                 */
+                if (arrayContains(bytes, STAR_DRAW)) {
+                    return SDA;
+                } else if (arrayContains(bytes, STAR_IMPRESS)) {
+                    return SDD;
+                } else if (arrayContains(bytes, WORKS_QUILL96)) {
+                    return WPS;
+                }
+            }
+        } catch (Exception e) {
+            /*
+             * "root.getEntry" can throw FileNotFoundException. The code inside
+             * "if" can throw IOExceptions. Theoretically. Practically no
+             * exceptions will likely ever appear.
+             *
+             * Swallow all of them. If any occur, we just assume that we can't
+             * distinguish between Draw and Impress and return something safe:
+             * x-tika-msoffice
+             */
+        }
+        return OLE;
+    }
+
+    // poor man's search for byte arrays, replace with some library call if
+    // you know one without adding new dependencies
+    private static boolean arrayContains(byte[] larger, byte[] smaller) {
+        int largerCounter = 0;
+        int smallerCounter = 0;
+        while (largerCounter < larger.length) {
+            if (larger[largerCounter] == smaller[smallerCounter]) {
+                largerCounter++;
+                smallerCounter++;
+                if (smallerCounter == smaller.length) {
+                    return true;
+                }
+            } else {
+                largerCounter = largerCounter - smallerCounter + 1;
+                smallerCounter = 0;
+            }
+        }
+        return false;
+    }
+
+    private static Set<String> getTopLevelNames(TikaInputStream stream)
+            throws IOException {
+        // Force the document stream to a (possibly temporary) file
+        // so we don't modify the current position of the stream
+        File file = stream.getFile();
+
+        try {
+            NPOIFSFileSystem fs = new NPOIFSFileSystem(file, true);
+
+            // Optimize a possible later parsing process by keeping
+            // a reference to the already opened POI file system
+            stream.setOpenContainer(fs);
+
+            return getTopLevelNames(fs.getRoot());
+        } catch (IOException e) {
+            // Parse error in POI, so we don't know the file type
+            return Collections.emptySet();
+        } catch (RuntimeException e) {
+            // Another problem in POI
+            return Collections.emptySet();
+        }
+    }
+
+    private static Set<String> getTopLevelNames(DirectoryNode root) {
+        Set<String> names = new HashSet<String>();
+        for (Entry entry : root) {
+            names.add(entry.getName());
+        }
+        return names;
+    }
+
+    public MediaType detect(InputStream input, Metadata metadata)
+            throws IOException {
+        // Check if we have access to the document
+        if (input == null) {
+            return MediaType.OCTET_STREAM;
+        }
+
+        // If this is a TikaInputStream wrapping an already
+        // parsed NPOIFileSystem/DirectoryNode, just get the
+        // names from the root:
+        TikaInputStream tis = TikaInputStream.cast(input);
+        Set<String> names = null;
+        if (tis != null) {
+            Object container = tis.getOpenContainer();
+            if (container instanceof NPOIFSFileSystem) {
+                names = getTopLevelNames(((NPOIFSFileSystem) container).getRoot());
+            } else if (container instanceof DirectoryNode) {
+                names = getTopLevelNames((DirectoryNode) container);
+            }
+        }
+
+        if (names == null) {
+            // Check if the document starts with the OLE header
+            input.mark(8);
+            try {
+                if (input.read() != 0xd0 || input.read() != 0xcf
+                        || input.read() != 0x11 || input.read() != 0xe0
+                        || input.read() != 0xa1 || input.read() != 0xb1
+                        || input.read() != 0x1a || input.read() != 0xe1) {
+                    return MediaType.OCTET_STREAM;
+                }
+            } finally {
+                input.reset();
+            }
+        }
+
+        // We can only detect the exact type when given a TikaInputStream
+        if (names == null && tis != null) {
+            // Look for known top level entry names to detect the document type
+            names = getTopLevelNames(tis);
+        }
+
+        // Detect based on the names (as available)
+        if (tis != null &&
+                tis.getOpenContainer() != null &&
+                tis.getOpenContainer() instanceof NPOIFSFileSystem) {
+            return detect(names, ((NPOIFSFileSystem) tis.getOpenContainer()).getRoot());
+        } else {
+            return detect(names, null);
+        }
+    }
+}

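A minimal detection sketch (illustrative only; the file name is a placeholder): handing the detector a TikaInputStream lets it spool the stream to a file and inspect the top-level POIFS entry names, whereas a plain InputStream only gets as far as the OLE2 magic bytes and the generic x-tika-msoffice type.

import java.nio.file.Paths;

import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.microsoft.POIFSContainerDetector;

public class ContainerDetectDemo {
    public static void main(String[] args) throws Exception {
        POIFSContainerDetector detector = new POIFSContainerDetector();
        // With a TikaInputStream the detector can open the POIFS container and
        // match the top-level entry names against the known document types.
        try (TikaInputStream tis = TikaInputStream.get(Paths.get("document.doc"))) {
            MediaType type = detector.detect(tis, new Metadata());
            System.out.println(type);   // e.g. application/msword
        }
    }
}
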
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,260 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Date;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.poi.hpsf.CustomProperties;
+import org.apache.poi.hpsf.DocumentSummaryInformation;
+import org.apache.poi.hpsf.MarkUnsupportedException;
+import org.apache.poi.hpsf.NoPropertySetStreamException;
+import org.apache.poi.hpsf.PropertySet;
+import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.hpsf.UnexpectedPropertySetTypeException;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.MSOffice;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.PagedText;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+
+/**
+ * Extractor for Common OLE2 (HPSF) metadata
+ */
+public class SummaryExtractor {
+    private static final Log logger = LogFactory.getLog(AbstractPOIFSExtractor.class);
+
+    private static final String SUMMARY_INFORMATION =
+            SummaryInformation.DEFAULT_STREAM_NAME;
+
+    private static final String DOCUMENT_SUMMARY_INFORMATION =
+            DocumentSummaryInformation.DEFAULT_STREAM_NAME;
+
+    private final Metadata metadata;
+
+    public SummaryExtractor(Metadata metadata) {
+        this.metadata = metadata;
+    }
+
+    public void parseSummaries(NPOIFSFileSystem filesystem)
+            throws IOException, TikaException {
+        parseSummaries(filesystem.getRoot());
+    }
+
+    public void parseSummaries(DirectoryNode root)
+            throws IOException, TikaException {
+        parseSummaryEntryIfExists(root, SUMMARY_INFORMATION);
+        parseSummaryEntryIfExists(root, DOCUMENT_SUMMARY_INFORMATION);
+    }
+
+    private void parseSummaryEntryIfExists(
+            DirectoryNode root, String entryName)
+            throws IOException, TikaException {
+        try {
+            DocumentEntry entry =
+                    (DocumentEntry) root.getEntry(entryName);
+            PropertySet properties =
+                    new PropertySet(new DocumentInputStream(entry));
+            if (properties.isSummaryInformation()) {
+                parse(new SummaryInformation(properties));
+            }
+            if (properties.isDocumentSummaryInformation()) {
+                parse(new DocumentSummaryInformation(properties));
+            }
+        } catch (FileNotFoundException e) {
+            // entry does not exist, just skip it
+        } catch (NoPropertySetStreamException e) {
+            // no property stream, just skip it
+        } catch (UnexpectedPropertySetTypeException e) {
+            throw new TikaException("Unexpected HPSF document", e);
+        } catch (MarkUnsupportedException e) {
+            throw new TikaException("Invalid DocumentInputStream", e);
+        } catch (Exception e) {
+            logger.warn("Ignoring unexpected exception while parsing summary entry " + entryName, e);
+        }
+    }
+
+    private void parse(SummaryInformation summary) {
+        set(TikaCoreProperties.TITLE, summary.getTitle());
+        addMulti(metadata, TikaCoreProperties.CREATOR, summary.getAuthor());
+        set(TikaCoreProperties.KEYWORDS, summary.getKeywords());
+        // TODO Move to OO subject in Tika 2.0
+        set(TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, summary.getSubject());
+        set(TikaCoreProperties.MODIFIER, summary.getLastAuthor());
+        set(TikaCoreProperties.COMMENTS, summary.getComments());
+        set(OfficeOpenXMLExtended.TEMPLATE, summary.getTemplate());
+        set(OfficeOpenXMLExtended.APPLICATION, summary.getApplicationName());
+        set(OfficeOpenXMLCore.REVISION, summary.getRevNumber());
+        set(TikaCoreProperties.CREATED, summary.getCreateDateTime());
+        set(TikaCoreProperties.MODIFIED, summary.getLastSaveDateTime());
+        set(TikaCoreProperties.PRINT_DATE, summary.getLastPrinted());
+        set(Metadata.EDIT_TIME, summary.getEditTime());
+        set(OfficeOpenXMLExtended.DOC_SECURITY, summary.getSecurity());
+
+        // New style counts
+        set(Office.WORD_COUNT, summary.getWordCount());
+        set(Office.CHARACTER_COUNT, summary.getCharCount());
+        set(Office.PAGE_COUNT, summary.getPageCount());
+        if (summary.getPageCount() > 0) {
+            metadata.set(PagedText.N_PAGES, summary.getPageCount());
+        }
+
+        // Old style, Tika 1.0 properties
+        // TODO Remove these in Tika 2.0
+        set(Metadata.TEMPLATE, summary.getTemplate());
+        set(Metadata.APPLICATION_NAME, summary.getApplicationName());
+        set(Metadata.REVISION_NUMBER, summary.getRevNumber());
+        set(Metadata.SECURITY, summary.getSecurity());
+        set(MSOffice.WORD_COUNT, summary.getWordCount());
+        set(MSOffice.CHARACTER_COUNT, summary.getCharCount());
+        set(MSOffice.PAGE_COUNT, summary.getPageCount());
+    }
+
+    private void parse(DocumentSummaryInformation summary) {
+        set(OfficeOpenXMLExtended.COMPANY, summary.getCompany());
+        addMulti(metadata, OfficeOpenXMLExtended.MANAGER, summary.getManager());
+        set(TikaCoreProperties.LANGUAGE, getLanguage(summary));
+        set(OfficeOpenXMLCore.CATEGORY, summary.getCategory());
+
+        // New style counts
+        set(Office.SLIDE_COUNT, summary.getSlideCount());
+        if (summary.getSlideCount() > 0) {
+            metadata.set(PagedText.N_PAGES, summary.getSlideCount());
+        }
+        // Old style, Tika 1.0 counts
+        // TODO Remove these in Tika 2.0
+        set(Metadata.COMPANY, summary.getCompany());
+        set(Metadata.MANAGER, summary.getManager());
+        set(MSOffice.SLIDE_COUNT, summary.getSlideCount());
+        set(Metadata.CATEGORY, summary.getCategory());
+
+        parse(summary.getCustomProperties());
+    }
+
+    private String getLanguage(DocumentSummaryInformation summary) {
+        CustomProperties customProperties = summary.getCustomProperties();
+        if (customProperties != null) {
+            Object value = customProperties.get("Language");
+            if (value instanceof String) {
+                return (String) value;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Attempt to parse custom document properties and add to the collection of metadata
+     *
+     * @param customProperties
+     */
+    private void parse(CustomProperties customProperties) {
+        if (customProperties != null) {
+            for (String name : customProperties.nameSet()) {
+                // Apply the custom prefix
+                String key = Metadata.USER_DEFINED_METADATA_NAME_PREFIX + name;
+
+                // Get, convert and save property value
+                Object value = customProperties.get(name);
+                if (value instanceof String) {
+                    set(key, (String) value);
+                } else if (value instanceof Date) {
+                    Property prop = Property.externalDate(key);
+                    metadata.set(prop, (Date) value);
+                } else if (value instanceof Boolean) {
+                    Property prop = Property.externalBoolean(key);
+                    metadata.set(prop, value.toString());
+                } else if (value instanceof Long) {
+                    Property prop = Property.externalInteger(key);
+                    metadata.set(prop, ((Long) value).intValue());
+                } else if (value instanceof Double) {
+                    Property prop = Property.externalReal(key);
+                    metadata.set(prop, (Double) value);
+                } else if (value instanceof Integer) {
+                    Property prop = Property.externalInteger(key);
+                    metadata.set(prop, ((Integer) value).intValue());
+                }
+            }
+        }
+    }
+
+    private void set(String name, String value) {
+        if (value != null) {
+            metadata.set(name, value);
+        }
+    }
+
+    private void set(Property property, String value) {
+        if (value != null) {
+            metadata.set(property, value);
+        }
+    }
+
+    private void set(Property property, Date value) {
+        if (value != null) {
+            metadata.set(property, value);
+        }
+    }
+
+    private void set(Property property, int value) {
+        if (value > 0) {
+            metadata.set(property, value);
+        }
+    }
+
+    private void set(String name, long value) {
+        if (value > 0) {
+            metadata.set(name, Long.toString(value));
+        }
+    }
+
+    //MS stores values that should be multiple values (e.g. dc:creator)
+    //as a semicolon-delimited list.  We need to split
+    //on semicolon to add each value.
+    public static void addMulti(Metadata metadata, Property property, String string) {
+        if (string == null) {
+            return;
+        }
+        String[] parts = string.split(";");
+        String[] current = metadata.getValues(property);
+        Set<String> seen = new HashSet<>();
+        if (current != null) {
+            for (String val : current) {
+                seen.add(val);
+            }
+        }
+        for (String part : parts) {
+            if (! seen.contains(part)) {
+                metadata.add(property, part);
+                seen.add(part);
+            }
+        }
+    }
+
+}

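The addMulti helper is the notable addition here; a small sketch of its behaviour on a semicolon-delimited author string (the values are made up for illustration):

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.microsoft.SummaryExtractor;

public class AddMultiDemo {
    public static void main(String[] args) {
        Metadata metadata = new Metadata();
        // HPSF stores multi-valued fields (e.g. dc:creator) as one semicolon-delimited
        // string; addMulti splits it and skips values already present on the property.
        SummaryExtractor.addMulti(metadata, TikaCoreProperties.CREATOR, "Alice;Bob;Alice");
        for (String creator : metadata.getValues(TikaCoreProperties.CREATOR)) {
            System.out.println(creator);   // prints "Alice" then "Bob" (duplicate dropped)
        }
    }
}
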
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.poi.hmef.Attachment;
+import org.apache.poi.hmef.HMEFMessage;
+import org.apache.poi.hmef.attribute.MAPIAttribute;
+import org.apache.poi.hmef.attribute.MAPIRtfAttribute;
+import org.apache.poi.hsmf.datatypes.MAPIProperty;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * A POI-powered Tika Parser for TNEF (Transport Neutral
+ * Encoding Format) messages, aka winmail.dat
+ */
+public class TNEFParser extends AbstractParser {
+    private static final long serialVersionUID = 4611820730372823452L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    MediaType.application("vnd.ms-tnef"),
+                    MediaType.application("ms-tnef"),
+                    MediaType.application("x-tnef")
+            )));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Extracts the subject, RTF body and attachments from a TNEF (winmail.dat) stream
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        // We work by recursing, so get the appropriate bits
+        EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
+        EmbeddedDocumentExtractor embeddedExtractor;
+        if (ex == null) {
+            embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
+        } else {
+            embeddedExtractor = ex;
+        }
+
+        // Ask POI to process the file for us
+        HMEFMessage msg = new HMEFMessage(stream);
+
+        // Set the message subject if known
+        String subject = msg.getSubject();
+        if (subject != null && subject.length() > 0) {
+            // TODO: Move to title in Tika 2.0
+            metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE, subject);
+        }
+
+        // Recurse into the message body RTF
+        MAPIAttribute attr = msg.getMessageMAPIAttribute(MAPIProperty.RTF_COMPRESSED);
+        if (attr != null && attr instanceof MAPIRtfAttribute) {
+            MAPIRtfAttribute rtf = (MAPIRtfAttribute) attr;
+            handleEmbedded(
+                    "message.rtf", "application/rtf",
+                    rtf.getData(),
+                    embeddedExtractor, handler
+            );
+        }
+
+        // Recurse into each attachment in turn
+        for (Attachment attachment : msg.getAttachments()) {
+            String name = attachment.getLongFilename();
+            if (name == null || name.length() == 0) {
+                name = attachment.getFilename();
+            }
+            if (name == null || name.length() == 0) {
+                String ext = attachment.getExtension();
+                if (ext != null) {
+                    name = "unknown" + ext;
+                }
+            }
+            handleEmbedded(
+                    name, null, attachment.getContents(),
+                    embeddedExtractor, handler
+            );
+        }
+    }
+
+    private void handleEmbedded(String name, String type, byte[] contents,
+                                EmbeddedDocumentExtractor embeddedExtractor, ContentHandler handler)
+            throws IOException, SAXException, TikaException {
+        Metadata metadata = new Metadata();
+        if (name != null)
+            metadata.set(Metadata.RESOURCE_NAME_KEY, name);
+        if (type != null)
+            metadata.set(Metadata.CONTENT_TYPE, type);
+
+        if (embeddedExtractor.shouldParseEmbedded(metadata)) {
+            embeddedExtractor.parseEmbedded(
+                    TikaInputStream.get(contents),
+                    new EmbeddedContentHandler(handler),
+                    metadata, false);
+        }
+    }
+}

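A minimal usage sketch (illustrative; the winmail.dat path and the AutoDetectParser registration are assumptions about typical wiring): registering a Parser in the ParseContext is what lets the fallback ParsingEmbeddedDocumentExtractor actually parse the RTF body and attachments rather than skip them.

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.TNEFParser;
import org.apache.tika.sax.BodyContentHandler;

public class TNEFDemo {
    public static void main(String[] args) throws Exception {
        TNEFParser parser = new TNEFParser();
        ParseContext context = new ParseContext();
        // Register a recursing parser so embedded content (RTF body, attachments)
        // is parsed into the same handler instead of being ignored.
        context.set(Parser.class, new AutoDetectParser());
        BodyContentHandler handler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        try (InputStream stream = Files.newInputStream(Paths.get("winmail.dat"))) {
            parser.parse(stream, handler, metadata, context);
        }
        System.out.println(handler.toString());
    }
}
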
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/TextCell.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/TextCell.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/TextCell.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/TextCell.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Text cell.
+ */
+public class TextCell implements Cell {
+
+    private final String text;
+
+    public TextCell(String text) {
+        this.text = text;
+    }
+
+    public void render(XHTMLContentHandler handler) throws SAXException {
+        handler.characters(text);
+    }
+
+    public String toString() {
+        return "Text Cell: \"" + text + "\"";
+    }
+}
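
The Cell interface that TextCell implements lives elsewhere in the same package and is not shown in this hunk; a trivial, purely illustrative sketch of how a cell renders into the XHTML stream:

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.microsoft.TextCell;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;

public class TextCellDemo {
    public static void main(String[] args) throws Exception {
        BodyContentHandler body = new BodyContentHandler();
        XHTMLContentHandler xhtml = new XHTMLContentHandler(body, new Metadata());
        xhtml.startDocument();
        new TextCell("42.0").render(xhtml);   // writes the cell text as character data
        xhtml.endDocument();
        System.out.println(body.toString());  // prints the captured cell text
    }
}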