You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/06 04:50:57 UTC
svn commit: r1723223 [10/32] - in /tika/branches/2.x:
tika-core/src/test/resources/META-INF/
tika-core/src/test/resources/META-INF/services/ tika-parser-modules/
tika-parser-modules/tika-advanced-module/
tika-parser-modules/tika-advanced-module/src/ ti...
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringReader;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.poi.hssf.extractor.OldExcelExtractor;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * A POI-powered Tika Parser for very old versions of Excel, from
+ * pre-OLE2 days, such as Excel 4.
+ */
+public class OldExcelParser extends AbstractParser {
+ private static final long serialVersionUID = 4611820730372823452L;
+
+ // Pre-OLE2 Excel media types: Excel 2, 3 and 4 sheets plus 3/4 workspaces.
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.application("vnd.ms-excel.sheet.4"),
+ MediaType.application("vnd.ms-excel.workspace.4"),
+ MediaType.application("vnd.ms-excel.sheet.3"),
+ MediaType.application("vnd.ms-excel.workspace.3"),
+ MediaType.application("vnd.ms-excel.sheet.2")
+ )));
+
+ /**
+ * Streams the extractor's text into the given XHTML handler, emitting
+ * one &lt;p&gt; element per line of extracted text.
+ *
+ * @param extractor POI extractor already opened on the document
+ * @param xhtml handler that receives the generated XHTML events
+ */
+ protected static void parse(OldExcelExtractor extractor,
+ XHTMLContentHandler xhtml) throws TikaException, IOException, SAXException {
+ // Get the whole text, as a single string
+ String text = extractor.getText();
+
+ // Split and output
+ xhtml.startDocument();
+
+ String line;
+ // NOTE(review): reader is never closed; harmless for a StringReader,
+ // but try-with-resources would be tidier.
+ BufferedReader reader = new BufferedReader(new StringReader(text));
+ while ((line = reader.readLine()) != null) {
+ xhtml.startElement("p");
+ xhtml.characters(line);
+ xhtml.endElement("p");
+ }
+
+ xhtml.endDocument();
+ }
+
+ /** Returns the (immutable) set of old Excel types this parser handles. */
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ /**
+ * Extracts properties and text from an MS Document input stream
+ */
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ // Open the POI provided extractor
+ OldExcelExtractor extractor = new OldExcelExtractor(stream);
+
+ // We can't do anything about metadata, as these old formats
+ // didn't have any stored with them
+
+ // Set the content type
+ // TODO Get the version and type, to set as the Content Type
+
+ // Have the text extracted and given to our Content Handler
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ parse(extractor, xhtml);
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,386 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.UnsupportedCharsetException;
+import java.text.ParseException;
+import java.util.Date;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.poi.hmef.attribute.MAPIRtfAttribute;
+import org.apache.poi.hsmf.MAPIMessage;
+import org.apache.poi.hsmf.datatypes.AttachmentChunks;
+import org.apache.poi.hsmf.datatypes.ByteChunk;
+import org.apache.poi.hsmf.datatypes.Chunk;
+import org.apache.poi.hsmf.datatypes.Chunks;
+import org.apache.poi.hsmf.datatypes.MAPIProperty;
+import org.apache.poi.hsmf.datatypes.PropertyValue;
+import org.apache.poi.hsmf.datatypes.StringChunk;
+import org.apache.poi.hsmf.datatypes.Types;
+import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.util.CodePageUtil;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.html.HtmlEncodingDetector;
+import org.apache.tika.parser.html.HtmlParser;
+import org.apache.tika.parser.mbox.MboxParser;
+import org.apache.tika.parser.rtf.RTFParser;
+import org.apache.tika.parser.txt.CharsetDetector;
+import org.apache.tika.parser.txt.CharsetMatch;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Outlook Message Parser.
+ */
+public class OutlookExtractor extends AbstractPOIFSExtractor {
+ // Shared, never-mutated metadata instance handed to the encoding detector.
+ private static final Metadata EMPTY_METADATA = new Metadata();
+ HtmlEncodingDetector detector = new HtmlEncodingDetector();
+
+ private final MAPIMessage msg;
+
+ public OutlookExtractor(NPOIFSFileSystem filesystem, ParseContext context) throws TikaException {
+ this(filesystem.getRoot(), context);
+ }
+
+ public OutlookExtractor(DirectoryNode root, ParseContext context) throws TikaException {
+ super(context);
+
+ try {
+ this.msg = new MAPIMessage(root);
+ } catch (IOException e) {
+ throw new TikaException("Failed to parse Outlook message", e);
+ }
+ }
+
+ /**
+ * Writes the message's metadata into {@code metadata} and renders the
+ * headers, body and attachments as XHTML. Body preference order is
+ * HTML, then compressed RTF, then plain text; attachments are passed
+ * to the embedded-resource / embedded-office handlers.
+ */
+ public void parse(XHTMLContentHandler xhtml, Metadata metadata)
+ throws TikaException, SAXException, IOException {
+ try {
+ msg.setReturnNullOnMissingChunk(true);
+
+ // If the message contains strings that aren't stored
+ // as Unicode, try to sort out an encoding for them
+ if (msg.has7BitEncodingStrings()) {
+ guess7BitEncoding(msg);
+ }
+
+ // Start with the metadata
+ String subject = msg.getSubject();
+ String from = msg.getDisplayFrom();
+
+ metadata.set(TikaCoreProperties.CREATOR, from);
+ metadata.set(Metadata.MESSAGE_FROM, from);
+ metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo());
+ metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC());
+ metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC());
+
+ metadata.set(TikaCoreProperties.TITLE, subject);
+ // TODO: Move to description in Tika 2.0
+ metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION,
+ msg.getConversationTopic());
+
+ try {
+ for (String recipientAddress : msg.getRecipientEmailAddressList()) {
+ if (recipientAddress != null)
+ metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
+ }
+ } catch (ChunkNotFoundException he) {
+ } // Will be fixed in POI 3.7 Final
+
+ // Date - try two ways to find it
+ // First try via the proper chunk
+ if (msg.getMessageDate() != null) {
+ metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
+ metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
+ } else {
+ try {
+ // Failing that try via the raw headers
+ String[] headers = msg.getHeaders();
+ if (headers != null && headers.length > 0) {
+ for (String header : headers) {
+ if (header.toLowerCase(Locale.ROOT).startsWith("date:")) {
+ String date = header.substring(header.indexOf(':') + 1).trim();
+
+ // See if we can parse it as a normal mail date
+ try {
+ Date d = MboxParser.parseDate(date);
+ metadata.set(TikaCoreProperties.CREATED, d);
+ metadata.set(TikaCoreProperties.MODIFIED, d);
+ } catch (ParseException e) {
+ // Store it as-is, and hope for the best...
+ metadata.set(TikaCoreProperties.CREATED, date);
+ metadata.set(TikaCoreProperties.MODIFIED, date);
+ }
+ break;
+ }
+ }
+ }
+ } catch (ChunkNotFoundException he) {
+ // We can't find the date, sorry...
+ }
+ }
+
+
+ xhtml.element("h1", subject);
+
+ // Output the from and to details in text, as you
+ // often want them in text form for searching
+ xhtml.startElement("dl");
+ if (from != null) {
+ header(xhtml, "From", from);
+ }
+ header(xhtml, "To", msg.getDisplayTo());
+ header(xhtml, "Cc", msg.getDisplayCC());
+ header(xhtml, "Bcc", msg.getDisplayBCC());
+ try {
+ header(xhtml, "Recipients", msg.getRecipientEmailAddress());
+ } catch (ChunkNotFoundException e) {
+ // Recipients chunk missing - just skip this header entry
+ }
+ xhtml.endElement("dl");
+
+ // Get the message body. Preference order is: html, rtf, text
+ Chunk htmlChunk = null;
+ Chunk rtfChunk = null;
+ Chunk textChunk = null;
+ for (Chunk chunk : msg.getMainChunks().getChunks()) {
+ if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
+ htmlChunk = chunk;
+ }
+ if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
+ rtfChunk = chunk;
+ }
+ if (chunk.getChunkId() == MAPIProperty.BODY.id) {
+ textChunk = chunk;
+ }
+ }
+
+ boolean doneBody = false;
+ xhtml.startElement("div", "class", "message-body");
+ if (htmlChunk != null) {
+ byte[] data = null;
+ if (htmlChunk instanceof ByteChunk) {
+ data = ((ByteChunk) htmlChunk).getValue();
+ } else if (htmlChunk instanceof StringChunk) {
+ data = ((StringChunk) htmlChunk).getRawValue();
+ }
+ if (data != null) {
+ HtmlParser htmlParser = new HtmlParser();
+ htmlParser.parse(
+ new ByteArrayInputStream(data),
+ new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
+ new Metadata(), new ParseContext()
+ );
+ doneBody = true;
+ }
+ }
+ if (rtfChunk != null && !doneBody) {
+ ByteChunk chunk = (ByteChunk) rtfChunk;
+ MAPIRtfAttribute rtf = new MAPIRtfAttribute(
+ MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue()
+ );
+ RTFParser rtfParser = new RTFParser();
+ rtfParser.parse(
+ new ByteArrayInputStream(rtf.getData()),
+ new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
+ new Metadata(), new ParseContext());
+ doneBody = true;
+ }
+ if (textChunk != null && !doneBody) {
+ xhtml.element("p", ((StringChunk) textChunk).getValue());
+ }
+ xhtml.endElement("div");
+
+ // Process the attachments
+ for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
+ xhtml.startElement("div", "class", "attachment-entry");
+
+ // Prefer the long file name, fall back to the short one
+ String filename = null;
+ if (attachment.attachLongFileName != null) {
+ filename = attachment.attachLongFileName.getValue();
+ } else if (attachment.attachFileName != null) {
+ filename = attachment.attachFileName.getValue();
+ }
+ if (filename != null && filename.length() > 0) {
+ xhtml.element("h1", filename);
+ }
+
+ if (attachment.attachData != null) {
+ handleEmbeddedResource(
+ TikaInputStream.get(attachment.attachData.getValue()),
+ filename, null,
+ null, xhtml, true
+ );
+ }
+ if (attachment.attachmentDirectory != null) {
+ handleEmbeddedOfficeDoc(
+ attachment.attachmentDirectory.getDirectory(),
+ xhtml
+ );
+ }
+
+ xhtml.endElement("div");
+ }
+ } catch (ChunkNotFoundException e) {
+ throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
+ }
+ }
+
+ /** Emits a dt/dd pair for a header, skipping null or empty values. */
+ private void header(XHTMLContentHandler xhtml, String key, String value)
+ throws SAXException {
+ if (value != null && value.length() > 0) {
+ xhtml.element("dt", key);
+ xhtml.element("dd", value);
+ }
+ }
+
+ /**
+ * Tries to identify the correct encoding for 7-bit (non-unicode)
+ * strings in the file.
+ * <p>Many messages store their strings as unicode, which is
+ * nice and easy. Some use one-byte encodings for their
+ * strings, but don't always store the encoding anywhere
+ * helpful in the file.</p>
+ * <p>This method checks for codepage properties, and failing that
+ * looks at the headers for the message, and uses these to
+ * guess the correct encoding for your file.</p>
+ * <p>Bug #49441 has more on why this is needed</p>
+ * <p>This is taken verbatim from POI (TIKA-1238)
+ * as a temporary workaround to prevent unsupported encoding exceptions</p>
+ */
+ private void guess7BitEncoding(MAPIMessage msg) {
+ Chunks mainChunks = msg.getMainChunks();
+ //sanity check
+ if (mainChunks == null) {
+ return;
+ }
+
+ Map<MAPIProperty, List<PropertyValue>> props = mainChunks.getProperties();
+ if (props != null) {
+ // First choice is a codepage property
+ for (MAPIProperty prop : new MAPIProperty[]{
+ MAPIProperty.MESSAGE_CODEPAGE,
+ MAPIProperty.INTERNET_CPID
+ }) {
+ List<PropertyValue> val = props.get(prop);
+ if (val != null && val.size() > 0) {
+ int codepage = ((PropertyValue.LongPropertyValue) val.get(0)).getValue();
+ String encoding = null;
+ try {
+ encoding = CodePageUtil.codepageToEncoding(codepage, true);
+ } catch (UnsupportedEncodingException e) {
+ //swallow
+ }
+ if (tryToSet7BitEncoding(msg, encoding)) {
+ return;
+ }
+ }
+ }
+ }
+
+ // Second choice is a charset on a content type header
+ try {
+ String[] headers = msg.getHeaders();
+ if(headers != null && headers.length > 0) {
+ // Look for a content type with a charset
+ Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE);
+
+ for(String header : headers) {
+ if(header.startsWith("Content-Type")) {
+ Matcher m = p.matcher(header);
+ // NOTE(review): Matcher.matches() must consume the whole
+ // header line, so a parameter after the charset (e.g.
+ // "; boundary=...") defeats the match; Matcher.find()
+ // may have been intended. Kept verbatim from POI (TIKA-1238).
+ if(m.matches()) {
+ // Found it! Tell all the string chunks
+ String charset = m.group(1);
+ if (tryToSet7BitEncoding(msg, charset)) {
+ return;
+ }
+ }
+ }
+ }
+ }
+ } catch(ChunkNotFoundException e) {}
+ // No headers chunk - fall through to the HTML-based guess
+
+ // Nothing suitable in the headers, try HTML
+ // TODO: do we need to replicate this in Tika? If we wind up
+ // parsing the html version of the email, this is duplicative??
+ // Or do we need to reset the header strings based on the html
+ // meta header if there is no other information?
+ try {
+ String html = msg.getHtmlBody();
+ if(html != null && html.length() > 0) {
+ Charset charset = null;
+ try {
+ charset = detector.detect(new ByteArrayInputStream(
+ html.getBytes(UTF_8)), EMPTY_METADATA);
+ } catch (IOException e) {
+ //swallow
+ }
+ if (charset != null && tryToSet7BitEncoding(msg, charset.name())) {
+ return;
+ }
+ }
+ } catch(ChunkNotFoundException e) {}
+ // No HTML body - fall through to the raw charset-detector fallback
+
+ //absolute last resort, try charset detector
+ StringChunk text = mainChunks.textBodyChunk;
+ if (text != null) {
+ // NOTE(review): local 'detector' shadows the HtmlEncodingDetector field
+ CharsetDetector detector = new CharsetDetector();
+ detector.setText(text.getRawValue());
+ CharsetMatch match = detector.detect();
+ if (match != null && match.getConfidence() > 35 &&
+ tryToSet7BitEncoding(msg, match.getName())) {
+ return;
+ }
+ }
+ }
+
+ /**
+ * Applies the charset to the message's 7-bit string chunks if it is
+ * non-null, not UTF-8 (which would mean the detection failed, since
+ * 7-bit chunks are by definition not unicode), and supported by the JVM.
+ *
+ * @return true if the encoding was applied
+ */
+ private boolean tryToSet7BitEncoding(MAPIMessage msg, String charsetName) {
+ if (charsetName == null) {
+ return false;
+ }
+
+ if (charsetName.equalsIgnoreCase("utf-8")) {
+ return false;
+ }
+ try {
+ if (Charset.isSupported(charsetName)) {
+ msg.set7BitEncoding(charsetName);
+ return true;
+ }
+ } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
+ //swallow
+ }
+ return false;
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,436 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.apache.tika.mime.MediaType.application;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.DocumentNode;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * A detector that works on a POIFS OLE2 document
+ * to figure out exactly what the file is.
+ * This should work for all OLE2 documents, whether
+ * they are ones supported by POI or not.
+ */
+public class POIFSContainerDetector implements Detector {
+
+ /**
+ * The OLE base file format
+ */
+ public static final MediaType OLE = application("x-tika-msoffice");
+ /**
+ * The protected OOXML base file format
+ */
+ public static final MediaType OOXML_PROTECTED = application("x-tika-ooxml-protected");
+ /**
+ * General embedded document type within an OLE2 container
+ */
+ public static final MediaType GENERAL_EMBEDDED = application("x-tika-msoffice-embedded");
+ /**
+ * An OLE10 Native embedded document within another OLE2 document
+ */
+ public static final MediaType OLE10_NATIVE =
+ new MediaType(GENERAL_EMBEDDED, "format", "ole10_native");
+ /**
+ * Some other kind of embedded document, in a CompObj container within another OLE2 document
+ */
+ public static final MediaType COMP_OBJ =
+ new MediaType(GENERAL_EMBEDDED, "format", "comp_obj");
+ /**
+ * Microsoft Excel
+ */
+ public static final MediaType XLS = application("vnd.ms-excel");
+ /**
+ * Microsoft Word
+ */
+ public static final MediaType DOC = application("msword");
+ /**
+ * Microsoft PowerPoint
+ */
+ public static final MediaType PPT = application("vnd.ms-powerpoint");
+ /**
+ * Microsoft Publisher
+ */
+ public static final MediaType PUB = application("x-mspublisher");
+ /**
+ * Microsoft Visio
+ */
+ public static final MediaType VSD = application("vnd.visio");
+ /**
+ * Microsoft Works
+ */
+ public static final MediaType WPS = application("vnd.ms-works");
+ /**
+ * Microsoft Works Spreadsheet 7.0
+ */
+ public static final MediaType XLR = application("x-tika-msworks-spreadsheet");
+ /**
+ * Microsoft Outlook
+ */
+ public static final MediaType MSG = application("vnd.ms-outlook");
+ /**
+ * Microsoft Project
+ */
+ public static final MediaType MPP = application("vnd.ms-project");
+ /**
+ * StarOffice Calc
+ */
+ public static final MediaType SDC = application("vnd.stardivision.calc");
+ /**
+ * StarOffice Draw
+ */
+ public static final MediaType SDA = application("vnd.stardivision.draw");
+ /**
+ * StarOffice Impress
+ */
+ public static final MediaType SDD = application("vnd.stardivision.impress");
+ /**
+ * StarOffice Writer
+ */
+ public static final MediaType SDW = application("vnd.stardivision.writer");
+ /**
+ * SolidWorks CAD file
+ */
+ public static final MediaType SLDWORKS = application("sldworks");
+ /**
+ * Hangul Word Processor (Korean)
+ */
+ public static final MediaType HWP = application("x-hwp-v5");
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = -3028021741663605293L;
+ /**
+ * An ASCII String "StarImpress"
+ */
+ private static final byte[] STAR_IMPRESS = new byte[]{
+ 0x53, 0x74, 0x61, 0x72, 0x49, 0x6d, 0x70, 0x72, 0x65, 0x73, 0x73
+ };
+ /**
+ * An ASCII String "StarDraw"
+ */
+ private static final byte[] STAR_DRAW = new byte[]{
+ 0x53, 0x74, 0x61, 0x72, 0x44, 0x72, 0x61, 0x77
+ };
+ /**
+ * An ASCII String "Quill96" for Works Files
+ */
+ private static final byte[] WORKS_QUILL96 = new byte[]{
+ 0x51, 0x75, 0x69, 0x6c, 0x6c, 0x39, 0x36
+ };
+ /**
+ * Regexp for matching the MPP Project Data stream
+ */
+ private static final Pattern mppDataMatch = Pattern.compile("\\s\\s\\s\\d+");
+
+ /**
+ * Internal detection of the specific kind of OLE2 document, based on the
+ * names of the top level streams within the file.
+ *
+ * @deprecated Use {@link #detect(Set, DirectoryEntry)} and pass the root
+ * entry of the filesystem whose type is to be detected, as a
+ * second argument.
+ */
+ protected static MediaType detect(Set<String> names) {
+ return detect(names, null);
+ }
+
+ /**
+ * Internal detection of the specific kind of OLE2 document, based on the
+ * names of the top-level streams within the file. In some cases the
+ * detection may need access to the root {@link DirectoryEntry} of that file
+ * for best results. The entry can be given as a second, optional argument.
+ *
+ * @param names the top-level entry names of the POIFS filesystem, may be null
+ * @param root the root directory entry of the filesystem, or null if unavailable
+ * @return the detected media type, or {@link #OLE} if no more specific type matched
+ */
+ protected static MediaType detect(Set<String> names, DirectoryEntry root) {
+ if (names != null) {
+ if (names.contains("SwDocContentMgr") && names.contains("SwDocMgrTempStorage")) {
+ return SLDWORKS;
+ } else if (names.contains("StarCalcDocument")) {
+ // Star Office Calc
+ return SDC;
+ } else if (names.contains("StarWriterDocument")) {
+ return SDW;
+ } else if (names.contains("StarDrawDocument3")) {
+ if (root == null) {
+ /*
+ * This is either StarOfficeDraw or StarOfficeImpress, we have
+ * to consult the CompObj to distinguish them, if this method is
+ * called in "legacy mode", without the root, just return
+ * x-tika-msoffice. The one-argument method is only for backward
+ * compatibility, if someone calls old API he/she can get the
+ * old result.
+ */
+ return OLE;
+ } else {
+ return processCompObjFormatType(root);
+ }
+ } else if (names.contains("\u0005HwpSummaryInformation")) {
+ // Hangul Word Processor v5+ (previous aren't OLE2-based)
+ return HWP;
+ } else if (names.contains("WksSSWorkBook")) {
+ // This check has to be before names.contains("Workbook")
+ // Works 7.0 spreadsheet files contain both
+ // we want to avoid classifying this as Excel
+ return XLR;
+ } else if (names.contains("Workbook") || names.contains("WORKBOOK")) {
+ return XLS;
+ } else if (names.contains("Book")) {
+ // Excel 95 or older, we won't be able to parse this....
+ return XLS;
+ } else if (names.contains("EncryptedPackage") &&
+ names.contains("EncryptionInfo") &&
+ names.contains("\u0006DataSpaces")) {
+ // This is a protected OOXML document, which is an OLE2 file
+ // with an Encrypted Stream which holds the OOXML data
+ // Without decrypting the stream, we can't tell what kind of
+ // OOXML file we have. Return a general OOXML Protected type,
+ // and hope the name based detection can guess the rest!
+ return OOXML_PROTECTED;
+ } else if (names.contains("EncryptedPackage")) {
+ return OLE;
+ } else if (names.contains("WordDocument")) {
+ return DOC;
+ } else if (names.contains("Quill")) {
+ return PUB;
+ } else if (names.contains("PowerPoint Document")) {
+ return PPT;
+ } else if (names.contains("VisioDocument")) {
+ return VSD;
+ } else if (names.contains("\u0001Ole10Native")) {
+ return OLE10_NATIVE;
+ } else if (names.contains("MatOST")) {
+ // this occurs on older Works Word Processor files (versions 3.0 and 4.0)
+ return WPS;
+ } else if (names.contains("CONTENTS") && names.contains("SPELLING")) {
+ // Newer Works files
+ return WPS;
+ } else if (names.contains("Contents") && names.contains("\u0003ObjInfo")) {
+ return COMP_OBJ;
+ } else if (names.contains("CONTENTS") && names.contains("\u0001CompObj")) {
+ // CompObj is a general kind of OLE2 embedding, but this may be an old Works file
+ // If we have the Directory, check
+ if (root != null) {
+ MediaType type = processCompObjFormatType(root);
+ if (type == WPS) {
+ return WPS;
+ } else {
+ // Assume it's a general CompObj embedded resource
+ return COMP_OBJ;
+ }
+ } else {
+ // Assume it's a general CompObj embedded resource
+ return COMP_OBJ;
+ }
+ } else if (names.contains("CONTENTS")) {
+ // CONTENTS without SPELLING nor CompObj normally means some sort
+ // of embedded non-office file inside an OLE2 document
+ // This is most commonly triggered on nested directories
+ return OLE;
+ } else if (names.contains("\u0001CompObj") &&
+ (names.contains("Props") || names.contains("Props9") || names.contains("Props12"))) {
+ // Could be Project, look for common name patterns
+ for (String name : names) {
+ if (mppDataMatch.matcher(name).matches()) {
+ return MPP;
+ }
+ }
+ } else if (names.contains("PerfectOffice_MAIN")) {
+ if (names.contains("SlideShow")) {
+ return MediaType.application("x-corelpresentations"); // .shw
+ } else if (names.contains("PerfectOffice_OBJECTS")) {
+ return MediaType.application("x-quattro-pro"); // .wb?
+ }
+ } else if (names.contains("NativeContent_MAIN")) {
+ return MediaType.application("x-quattro-pro"); // .qpw
+ } else {
+ for (String name : names) {
+ if (name.startsWith("__substg1.0_")) {
+ return MSG;
+ }
+ }
+ }
+ }
+
+ // Couldn't detect a more specific type
+ return OLE;
+ }
+
+ /**
+ * Is this one of the kinds of formats which uses CompObj to
+ * store all of their data, eg Star Draw, Star Impress or
+ * (older) Works?
+ * If not, it's likely an embedded resource
+ */
+ private static MediaType processCompObjFormatType(DirectoryEntry root) {
+ try {
+ Entry e = root.getEntry("\u0001CompObj");
+ if (e != null && e.isDocumentEntry()) {
+ DocumentNode dn = (DocumentNode) e;
+ DocumentInputStream stream = new DocumentInputStream(dn);
+ byte[] bytes = IOUtils.toByteArray(stream);
+ /*
+ * This array contains a string with a normal ASCII name of the
+ * application used to create this file. We want to search for that
+ * name.
+ */
+ if (arrayContains(bytes, STAR_DRAW)) {
+ return SDA;
+ } else if (arrayContains(bytes, STAR_IMPRESS)) {
+ return SDD;
+ } else if (arrayContains(bytes, WORKS_QUILL96)) {
+ return WPS;
+ }
+ }
+ } catch (Exception e) {
+ /*
+ * "root.getEntry" can throw FileNotFoundException. The code inside
+ * "if" can throw IOExceptions. Theoretically. Practically no
+ * exceptions will likely ever appear.
+ *
+ * Swallow all of them. If any occur, we just assume that we can't
+ * distinguish between Draw and Impress and return something safe:
+ * x-tika-msoffice
+ */
+ }
+ return OLE;
+ }
+
+ // poor man's search for byte arrays, replace with some library call if
+ // you know one without adding new dependencies
+ private static boolean arrayContains(byte[] larger, byte[] smaller) {
+ int largerCounter = 0;
+ int smallerCounter = 0;
+ while (largerCounter < larger.length) {
+ if (larger[largerCounter] == smaller[smallerCounter]) {
+ largerCounter++;
+ smallerCounter++;
+ if (smallerCounter == smaller.length) {
+ return true;
+ }
+ } else {
+ // Mismatch: back up to one past where this candidate match began
+ largerCounter = largerCounter - smallerCounter + 1;
+ smallerCounter = 0;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Opens the stream's backing file as a POIFS filesystem and returns the
+ * top-level entry names, caching the open filesystem on the stream for
+ * any later parse. Returns an empty set if POI cannot read the file.
+ */
+ private static Set<String> getTopLevelNames(TikaInputStream stream)
+ throws IOException {
+ // Force the document stream to a (possibly temporary) file
+ // so we don't modify the current position of the stream
+ File file = stream.getFile();
+
+ try {
+ NPOIFSFileSystem fs = new NPOIFSFileSystem(file, true);
+
+ // Optimize a possible later parsing process by keeping
+ // a reference to the already opened POI file system
+ stream.setOpenContainer(fs);
+
+ return getTopLevelNames(fs.getRoot());
+ } catch (IOException e) {
+ // Parse error in POI, so we don't know the file type
+ return Collections.emptySet();
+ } catch (RuntimeException e) {
+ // Another problem in POI
+ return Collections.emptySet();
+ }
+ }
+
+ /** Collects the names of the entries directly under the given directory. */
+ private static Set<String> getTopLevelNames(DirectoryNode root) {
+ Set<String> names = new HashSet<String>();
+ for (Entry entry : root) {
+ names.add(entry.getName());
+ }
+ return names;
+ }
+
+ /**
+ * Detects the specific OLE2 type of the stream. Reuses an already-open
+ * POIFS container when the input is a {@link TikaInputStream}; otherwise
+ * verifies the 8-byte OLE2 magic and, for a TikaInputStream, reads the
+ * top-level entry names to refine the type. Falls back to
+ * application/x-tika-msoffice when no names are available.
+ */
+ public MediaType detect(InputStream input, Metadata metadata)
+ throws IOException {
+ // Check if we have access to the document
+ if (input == null) {
+ return MediaType.OCTET_STREAM;
+ }
+
+ // If this is a TikaInputStream wrapping an already
+ // parsed NPOIFileSystem/DirectoryNode, just get the
+ // names from the root:
+ TikaInputStream tis = TikaInputStream.cast(input);
+ Set<String> names = null;
+ if (tis != null) {
+ Object container = tis.getOpenContainer();
+ if (container instanceof NPOIFSFileSystem) {
+ names = getTopLevelNames(((NPOIFSFileSystem) container).getRoot());
+ } else if (container instanceof DirectoryNode) {
+ names = getTopLevelNames((DirectoryNode) container);
+ }
+ }
+
+ if (names == null) {
+ // Check if the document starts with the OLE header
+ input.mark(8);
+ try {
+ if (input.read() != 0xd0 || input.read() != 0xcf
+ || input.read() != 0x11 || input.read() != 0xe0
+ || input.read() != 0xa1 || input.read() != 0xb1
+ || input.read() != 0x1a || input.read() != 0xe1) {
+ return MediaType.OCTET_STREAM;
+ }
+ } finally {
+ input.reset();
+ }
+ }
+
+ // We can only detect the exact type when given a TikaInputStream
+ if (names == null && tis != null) {
+ // Look for known top level entry names to detect the document type
+ names = getTopLevelNames(tis);
+ }
+
+ // Detect based on the names (as available)
+ if (tis != null &&
+ tis.getOpenContainer() != null &&
+ tis.getOpenContainer() instanceof NPOIFSFileSystem) {
+ return detect(names, ((NPOIFSFileSystem) tis.getOpenContainer()).getRoot());
+ } else {
+ return detect(names, null);
+ }
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,260 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Date;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.poi.hpsf.CustomProperties;
+import org.apache.poi.hpsf.DocumentSummaryInformation;
+import org.apache.poi.hpsf.MarkUnsupportedException;
+import org.apache.poi.hpsf.NoPropertySetStreamException;
+import org.apache.poi.hpsf.PropertySet;
+import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.hpsf.UnexpectedPropertySetTypeException;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.MSOffice;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.PagedText;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+
+/**
+ * Extractor for Common OLE2 (HPSF) metadata
+ */
+public class SummaryExtractor {
+ private static final Log logger = LogFactory.getLog(AbstractPOIFSExtractor.class);
+
+ private static final String SUMMARY_INFORMATION =
+ SummaryInformation.DEFAULT_STREAM_NAME;
+
+ private static final String DOCUMENT_SUMMARY_INFORMATION =
+ DocumentSummaryInformation.DEFAULT_STREAM_NAME;
+
+ private final Metadata metadata;
+
+ public SummaryExtractor(Metadata metadata) {
+ this.metadata = metadata;
+ }
+
+ public void parseSummaries(NPOIFSFileSystem filesystem)
+ throws IOException, TikaException {
+ parseSummaries(filesystem.getRoot());
+ }
+
+ public void parseSummaries(DirectoryNode root)
+ throws IOException, TikaException {
+ parseSummaryEntryIfExists(root, SUMMARY_INFORMATION);
+ parseSummaryEntryIfExists(root, DOCUMENT_SUMMARY_INFORMATION);
+ }
+
+ private void parseSummaryEntryIfExists(
+ DirectoryNode root, String entryName)
+ throws IOException, TikaException {
+ try {
+ DocumentEntry entry =
+ (DocumentEntry) root.getEntry(entryName);
+ PropertySet properties =
+ new PropertySet(new DocumentInputStream(entry));
+ if (properties.isSummaryInformation()) {
+ parse(new SummaryInformation(properties));
+ }
+ if (properties.isDocumentSummaryInformation()) {
+ parse(new DocumentSummaryInformation(properties));
+ }
+ } catch (FileNotFoundException e) {
+ // entry does not exist, just skip it
+ } catch (NoPropertySetStreamException e) {
+ // no property stream, just skip it
+ } catch (UnexpectedPropertySetTypeException e) {
+ throw new TikaException("Unexpected HPSF document", e);
+ } catch (MarkUnsupportedException e) {
+ throw new TikaException("Invalid DocumentInputStream", e);
+ } catch (Exception e) {
+ logger.warn("Ignoring unexpected exception while parsing summary entry " + entryName, e);
+ }
+ }
+
+ private void parse(SummaryInformation summary) {
+ set(TikaCoreProperties.TITLE, summary.getTitle());
+ addMulti(metadata, TikaCoreProperties.CREATOR, summary.getAuthor());
+ set(TikaCoreProperties.KEYWORDS, summary.getKeywords());
+ // TODO Move to OO subject in Tika 2.0
+ set(TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, summary.getSubject());
+ set(TikaCoreProperties.MODIFIER, summary.getLastAuthor());
+ set(TikaCoreProperties.COMMENTS, summary.getComments());
+ set(OfficeOpenXMLExtended.TEMPLATE, summary.getTemplate());
+ set(OfficeOpenXMLExtended.APPLICATION, summary.getApplicationName());
+ set(OfficeOpenXMLCore.REVISION, summary.getRevNumber());
+ set(TikaCoreProperties.CREATED, summary.getCreateDateTime());
+ set(TikaCoreProperties.MODIFIED, summary.getLastSaveDateTime());
+ set(TikaCoreProperties.PRINT_DATE, summary.getLastPrinted());
+ set(Metadata.EDIT_TIME, summary.getEditTime());
+ set(OfficeOpenXMLExtended.DOC_SECURITY, summary.getSecurity());
+
+ // New style counts
+ set(Office.WORD_COUNT, summary.getWordCount());
+ set(Office.CHARACTER_COUNT, summary.getCharCount());
+ set(Office.PAGE_COUNT, summary.getPageCount());
+ if (summary.getPageCount() > 0) {
+ metadata.set(PagedText.N_PAGES, summary.getPageCount());
+ }
+
+ // Old style, Tika 1.0 properties
+ // TODO Remove these in Tika 2.0
+ set(Metadata.TEMPLATE, summary.getTemplate());
+ set(Metadata.APPLICATION_NAME, summary.getApplicationName());
+ set(Metadata.REVISION_NUMBER, summary.getRevNumber());
+ set(Metadata.SECURITY, summary.getSecurity());
+ set(MSOffice.WORD_COUNT, summary.getWordCount());
+ set(MSOffice.CHARACTER_COUNT, summary.getCharCount());
+ set(MSOffice.PAGE_COUNT, summary.getPageCount());
+ }
+
+ private void parse(DocumentSummaryInformation summary) {
+ set(OfficeOpenXMLExtended.COMPANY, summary.getCompany());
+ addMulti(metadata, OfficeOpenXMLExtended.MANAGER, summary.getManager());
+ set(TikaCoreProperties.LANGUAGE, getLanguage(summary));
+ set(OfficeOpenXMLCore.CATEGORY, summary.getCategory());
+
+ // New style counts
+ set(Office.SLIDE_COUNT, summary.getSlideCount());
+ if (summary.getSlideCount() > 0) {
+ metadata.set(PagedText.N_PAGES, summary.getSlideCount());
+ }
+ // Old style, Tika 1.0 counts
+ // TODO Remove these in Tika 2.0
+ set(Metadata.COMPANY, summary.getCompany());
+ set(Metadata.MANAGER, summary.getManager());
+ set(MSOffice.SLIDE_COUNT, summary.getSlideCount());
+ set(Metadata.CATEGORY, summary.getCategory());
+
+ parse(summary.getCustomProperties());
+ }
+
+ private String getLanguage(DocumentSummaryInformation summary) {
+ CustomProperties customProperties = summary.getCustomProperties();
+ if (customProperties != null) {
+ Object value = customProperties.get("Language");
+ if (value instanceof String) {
+ return (String) value;
+ }
+ }
+ return null;
+ }
+
+ /**
+ * Attempt to parse custom document properties and add to the collection of metadata
+ *
+ * @param customProperties
+ */
+ private void parse(CustomProperties customProperties) {
+ if (customProperties != null) {
+ for (String name : customProperties.nameSet()) {
+ // Apply the custom prefix
+ String key = Metadata.USER_DEFINED_METADATA_NAME_PREFIX + name;
+
+ // Get, convert and save property value
+ Object value = customProperties.get(name);
+ if (value instanceof String) {
+ set(key, (String) value);
+ } else if (value instanceof Date) {
+ Property prop = Property.externalDate(key);
+ metadata.set(prop, (Date) value);
+ } else if (value instanceof Boolean) {
+ Property prop = Property.externalBoolean(key);
+ metadata.set(prop, value.toString());
+ } else if (value instanceof Long) {
+ Property prop = Property.externalInteger(key);
+ metadata.set(prop, ((Long) value).intValue());
+ } else if (value instanceof Double) {
+ Property prop = Property.externalReal(key);
+ metadata.set(prop, (Double) value);
+ } else if (value instanceof Integer) {
+ Property prop = Property.externalInteger(key);
+ metadata.set(prop, ((Integer) value).intValue());
+ }
+ }
+ }
+ }
+
+ private void set(String name, String value) {
+ if (value != null) {
+ metadata.set(name, value);
+ }
+ }
+
+ private void set(Property property, String value) {
+ if (value != null) {
+ metadata.set(property, value);
+ }
+ }
+
+ private void set(Property property, Date value) {
+ if (value != null) {
+ metadata.set(property, value);
+ }
+ }
+
+ private void set(Property property, int value) {
+ if (value > 0) {
+ metadata.set(property, value);
+ }
+ }
+
+ private void set(String name, long value) {
+ if (value > 0) {
+ metadata.set(name, Long.toString(value));
+ }
+ }
+
+ //MS stores values that should be multiple values (e.g. dc:creator)
+ //as a semicolon-delimited list. We need to split
+ //on semicolon to add each value.
+ public static void addMulti(Metadata metadata, Property property, String string) {
+ if (string == null) {
+ return;
+ }
+ String[] parts = string.split(";");
+ String[] current = metadata.getValues(property);
+ Set<String> seen = new HashSet<>();
+ if (current != null) {
+ for (String val : current) {
+ seen.add(val);
+ }
+ }
+ for (String part : parts) {
+ if (! seen.contains(part)) {
+ metadata.add(property, part);
+ seen.add(part);
+ }
+ }
+ }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.poi.hmef.Attachment;
+import org.apache.poi.hmef.HMEFMessage;
+import org.apache.poi.hmef.attribute.MAPIAttribute;
+import org.apache.poi.hmef.attribute.MAPIRtfAttribute;
+import org.apache.poi.hsmf.datatypes.MAPIProperty;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * A POI-powered Tika Parser for TNEF (Transport Neutral
+ * Encoding Format) messages, aka winmail.dat
+ */
+public class TNEFParser extends AbstractParser {
+ private static final long serialVersionUID = 4611820730372823452L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.application("vnd.ms-tnef"),
+ MediaType.application("ms-tnef"),
+ MediaType.application("x-tnef")
+ )));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ /**
+ * Extracts properties and text from an MS Document input stream
+ */
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+
+ // We work by recursing, so get the appropriate bits
+ EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
+ EmbeddedDocumentExtractor embeddedExtractor;
+ if (ex == null) {
+ embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
+ } else {
+ embeddedExtractor = ex;
+ }
+
+ // Ask POI to process the file for us
+ HMEFMessage msg = new HMEFMessage(stream);
+
+ // Set the message subject if known
+ String subject = msg.getSubject();
+ if (subject != null && subject.length() > 0) {
+ // TODO: Move to title in Tika 2.0
+ metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE, subject);
+ }
+
+ // Recurse into the message body RTF
+ MAPIAttribute attr = msg.getMessageMAPIAttribute(MAPIProperty.RTF_COMPRESSED);
+ if (attr != null && attr instanceof MAPIRtfAttribute) {
+ MAPIRtfAttribute rtf = (MAPIRtfAttribute) attr;
+ handleEmbedded(
+ "message.rtf", "application/rtf",
+ rtf.getData(),
+ embeddedExtractor, handler
+ );
+ }
+
+ // Recurse into each attachment in turn
+ for (Attachment attachment : msg.getAttachments()) {
+ String name = attachment.getLongFilename();
+ if (name == null || name.length() == 0) {
+ name = attachment.getFilename();
+ }
+ if (name == null || name.length() == 0) {
+ String ext = attachment.getExtension();
+ if (ext != null) {
+ name = "unknown" + ext;
+ }
+ }
+ handleEmbedded(
+ name, null, attachment.getContents(),
+ embeddedExtractor, handler
+ );
+ }
+ }
+
+ private void handleEmbedded(String name, String type, byte[] contents,
+ EmbeddedDocumentExtractor embeddedExtractor, ContentHandler handler)
+ throws IOException, SAXException, TikaException {
+ Metadata metadata = new Metadata();
+ if (name != null)
+ metadata.set(Metadata.RESOURCE_NAME_KEY, name);
+ if (type != null)
+ metadata.set(Metadata.CONTENT_TYPE, type);
+
+ if (embeddedExtractor.shouldParseEmbedded(metadata)) {
+ embeddedExtractor.parseEmbedded(
+ TikaInputStream.get(contents),
+ new EmbeddedContentHandler(handler),
+ metadata, false);
+ }
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/TextCell.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/TextCell.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/TextCell.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/TextCell.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Text cell.
+ */
+public class TextCell implements Cell {
+
+ private final String text;
+
+ public TextCell(String text) {
+ this.text = text;
+ }
+
+ public void render(XHTMLContentHandler handler) throws SAXException {
+ handler.characters(text);
+ }
+
+ public String toString() {
+ return "Text Cell: \"" + text + "\"";
+ }
+}