You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/09 03:03:06 UTC

svn commit: r1723820 - in /tika/branches/2.x/tika-parser-modules: tika-office-module/ tika-office-module/src/main/java/org/apache/tika/parser/mbox/ tika-office-module/src/test/java/org/apache/tika/parser/mbox/ tika-web-module/ tika-web-module/src/main/...

Author: bob
Date: Sat Jan  9 02:03:05 2016
New Revision: 1723820

URL: http://svn.apache.org/viewvc?rev=1723820&view=rev
Log:
TIKA-1824 - Move mbox to office.

Added:
    tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/mbox/
    tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
    tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
    tika/branches/2.x/tika-parser-modules/tika-office-module/src/test/java/org/apache/tika/parser/mbox/
    tika/branches/2.x/tika-parser-modules/tika-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
    tika/branches/2.x/tika-parser-modules/tika-office-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java
Removed:
    tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mbox/
    tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/mbox/
Modified:
    tika/branches/2.x/tika-parser-modules/tika-office-module/pom.xml
    tika/branches/2.x/tika-parser-modules/tika-web-module/pom.xml

Modified: tika/branches/2.x/tika-parser-modules/tika-office-module/pom.xml
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/pom.xml?rev=1723820&r1=1723819&r2=1723820&view=diff
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/pom.xml (original)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/pom.xml Sat Jan  9 02:03:05 2016
@@ -73,6 +73,11 @@
       <version>2.1.1</version>
     </dependency>
     <dependency>
+      <groupId>com.pff</groupId>
+      <artifactId>java-libpst</artifactId>
+      <version>0.8.1</version>
+    </dependency>
+    <dependency>
       <groupId>${project.groupId}</groupId>
       <artifactId>tika-package-module</artifactId>
       <version>${project.version}</version>

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java?rev=1723820&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java Sat Jan  9 02:03:05 2016
@@ -0,0 +1,209 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Collections;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Queue;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Mbox (mailbox) parser. This version extracts each mail from Mbox and uses the
+ * DelegatingParser to process each mail.
+ */
+public class MboxParser extends AbstractParser {
+
+    public static final String MBOX_MIME_TYPE = "application/mbox";
+    public static final String MBOX_RECORD_DIVIDER = "From ";
+    public static final int MAIL_MAX_SIZE = 50000000;
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -1762689436731160661L;
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("mbox"));
+    private static final Pattern EMAIL_HEADER_PATTERN = Pattern.compile("([^ ]+):[ \t]*(.*)");
+    private static final Pattern EMAIL_ADDRESS_PATTERN = Pattern.compile("<(.*@.*)>");
+
+    private static final String EMAIL_HEADER_METADATA_PREFIX = "MboxParser-";
+    private static final String EMAIL_FROMLINE_METADATA = EMAIL_HEADER_METADATA_PREFIX + "from";
+    private final Map<Integer, Metadata> trackingMetadata = new HashMap<Integer, Metadata>();
+    private boolean tracking = false;
+
+    public static Date parseDate(String headerContent) throws ParseException {
+        SimpleDateFormat dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US);
+        return dateFormat.parse(headerContent);
+    }
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, TikaException, SAXException {
+
+        EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class,
+                new ParsingEmbeddedDocumentExtractor(context));
+
+        String charsetName = "windows-1252";
+
+        metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
+        metadata.set(Metadata.CONTENT_ENCODING, charsetName);
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        InputStreamReader isr = new InputStreamReader(stream, charsetName);
+        try (BufferedReader reader = new BufferedReader(isr)) {
+            String curLine = reader.readLine();
+            int mailItem = 0;
+            do {
+                if (curLine.startsWith(MBOX_RECORD_DIVIDER)) {
+                    Metadata mailMetadata = new Metadata();
+                    Queue<String> multiline = new LinkedList<String>();
+                    mailMetadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length()));
+                    mailMetadata.set(Metadata.CONTENT_TYPE, "message/rfc822");
+                    curLine = reader.readLine();
+
+                    ByteArrayOutputStream message = new ByteArrayOutputStream(100000);
+                    do {
+                        if (curLine.startsWith(" ") || curLine.startsWith("\t")) {
+                            String latestLine = multiline.poll();
+                            latestLine += " " + curLine.trim();
+                            multiline.add(latestLine);
+                        } else {
+                            multiline.add(curLine);
+                        }
+
+                        message.write(curLine.getBytes(charsetName));
+                        message.write(0x0A);
+                        curLine = reader.readLine();
+                    }
+                    while (curLine != null && !curLine.startsWith(MBOX_RECORD_DIVIDER) && message.size() < MAIL_MAX_SIZE);
+
+                    for (String item : multiline) {
+                        saveHeaderInMetadata(mailMetadata, item);
+                    }
+
+                    ByteArrayInputStream messageStream = new ByteArrayInputStream(message.toByteArray());
+                    message = null;
+
+                    if (extractor.shouldParseEmbedded(mailMetadata)) {
+                        extractor.parseEmbedded(messageStream, xhtml, mailMetadata, true);
+                    }
+
+                    if (tracking) {
+                        getTrackingMetadata().put(mailItem++, mailMetadata);
+                    }
+                } else {
+                    curLine = reader.readLine();
+                }
+
+            } while (curLine != null && !Thread.currentThread().isInterrupted());
+        }
+
+        xhtml.endDocument();
+    }
+
+    public boolean isTracking() {
+        return tracking;
+    }
+
+    public void setTracking(boolean tracking) {
+        this.tracking = tracking;
+    }
+
+    public Map<Integer, Metadata> getTrackingMetadata() {
+        return trackingMetadata;
+    }
+
+    private void saveHeaderInMetadata(Metadata metadata, String curLine) {
+        Matcher headerMatcher = EMAIL_HEADER_PATTERN.matcher(curLine);
+        if (!headerMatcher.matches()) {
+            return; // ignore malformed header lines
+        }
+
+        String headerTag = headerMatcher.group(1).toLowerCase(Locale.ROOT);
+        String headerContent = headerMatcher.group(2);
+
+        if (headerTag.equalsIgnoreCase("From")) {
+            metadata.set(TikaCoreProperties.CREATOR, headerContent);
+        } else if (headerTag.equalsIgnoreCase("To") || headerTag.equalsIgnoreCase("Cc")
+                || headerTag.equalsIgnoreCase("Bcc")) {
+            Matcher address = EMAIL_ADDRESS_PATTERN.matcher(headerContent);
+            if (address.find()) {
+                metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, address.group(1));
+            } else if (headerContent.indexOf('@') > -1) {
+                metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, headerContent);
+            }
+
+            String property = Metadata.MESSAGE_TO;
+            if (headerTag.equalsIgnoreCase("Cc")) {
+                property = Metadata.MESSAGE_CC;
+            } else if (headerTag.equalsIgnoreCase("Bcc")) {
+                property = Metadata.MESSAGE_BCC;
+            }
+            metadata.add(property, headerContent);
+        } else if (headerTag.equalsIgnoreCase("Subject")) {
+            metadata.add(Metadata.SUBJECT, headerContent);
+        } else if (headerTag.equalsIgnoreCase("Date")) {
+            try {
+                Date date = parseDate(headerContent);
+                metadata.set(TikaCoreProperties.CREATED, date);
+            } catch (ParseException e) {
+                // ignoring date because format was not understood
+            }
+        } else if (headerTag.equalsIgnoreCase("Message-Id")) {
+            metadata.set(TikaCoreProperties.IDENTIFIER, headerContent);
+        } else if (headerTag.equalsIgnoreCase("In-Reply-To")) {
+            metadata.set(TikaCoreProperties.RELATION, headerContent);
+        } else if (headerTag.equalsIgnoreCase("Content-Type")) {
+            // TODO - key off content-type in headers to
+            // set mapping to use for content and convert if necessary.
+
+            metadata.add(Metadata.CONTENT_TYPE, headerContent);
+            metadata.set(TikaCoreProperties.FORMAT, headerContent);
+        } else {
+            metadata.add(EMAIL_HEADER_METADATA_PREFIX + headerTag, headerContent);
+        }
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java?rev=1723820&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java Sat Jan  9 02:03:05 2016
@@ -0,0 +1,203 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import static java.lang.String.valueOf;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static java.util.Collections.singleton;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Set;
+
+import com.pff.PSTAttachment;
+import com.pff.PSTFile;
+import com.pff.PSTFolder;
+import com.pff.PSTMessage;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Parser for MS Outlook PST email storage files
+ */
+public class OutlookPSTParser extends AbstractParser {
+
+    private static final long serialVersionUID = 620998217748364063L;
+
+    public static final MediaType MS_OUTLOOK_PST_MIMETYPE = MediaType.application("vnd.ms-outlook-pst");
+    private static final Set<MediaType> SUPPORTED_TYPES = singleton(MS_OUTLOOK_PST_MIMETYPE);
+
+    private static AttributesImpl createAttribute(String attName, String attValue) {
+        AttributesImpl attributes = new AttributesImpl();
+        attributes.addAttribute("", attName, attName, "CDATA", attValue);
+        return attributes;
+    }
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        // Use the delegate parser to parse the contained document
+        EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class,
+                new ParsingEmbeddedDocumentExtractor(context));
+
+        metadata.set(Metadata.CONTENT_TYPE, MS_OUTLOOK_PST_MIMETYPE.toString());
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        TikaInputStream in = TikaInputStream.get(stream);
+        PSTFile pstFile = null;
+        try {
+            pstFile = new PSTFile(in.getFile().getPath());
+            metadata.set(Metadata.CONTENT_LENGTH, valueOf(pstFile.getFileHandle().length()));
+            boolean isValid = pstFile.getFileHandle().getFD().valid();
+            metadata.set("isValid", valueOf(isValid));
+            if (isValid) {
+                parseFolder(xhtml, pstFile.getRootFolder(), embeddedExtractor);
+            }
+        } catch (Exception e) {
+            throw new TikaException(e.getMessage(), e);
+        } finally {
+            if (pstFile != null && pstFile.getFileHandle() != null) {
+                try {
+                    pstFile.getFileHandle().close();
+                } catch (IOException e) {
+                    //swallow closing exception
+                }
+            }
+        }
+
+        xhtml.endDocument();
+    }
+
+    private void parseFolder(XHTMLContentHandler handler, PSTFolder pstFolder, EmbeddedDocumentExtractor embeddedExtractor)
+            throws Exception {
+        if (pstFolder.getContentCount() > 0) {
+            PSTMessage pstMail = (PSTMessage) pstFolder.getNextChild();
+            while (pstMail != null) {
+                AttributesImpl attributes = new AttributesImpl();
+                attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+                attributes.addAttribute("", "id", "id", "CDATA", pstMail.getInternetMessageId());
+                handler.startElement("div", attributes);
+                handler.element("h1", pstMail.getSubject());
+
+                parserMailItem(handler, pstMail, embeddedExtractor);
+                parseMailAttachments(handler, pstMail, embeddedExtractor);
+
+                handler.endElement("div");
+
+                pstMail = (PSTMessage) pstFolder.getNextChild();
+            }
+        }
+
+        if (pstFolder.hasSubfolders()) {
+            for (PSTFolder pstSubFolder : pstFolder.getSubFolders()) {
+                handler.startElement("div", createAttribute("class", "email-folder"));
+                handler.element("h1", pstSubFolder.getDisplayName());
+                parseFolder(handler, pstSubFolder, embeddedExtractor);
+                handler.endElement("div");
+            }
+        }
+    }
+
+    private void parserMailItem(XHTMLContentHandler handler, PSTMessage pstMail, EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, IOException {
+        Metadata mailMetadata = new Metadata();
+        mailMetadata.set(Metadata.RESOURCE_NAME_KEY, pstMail.getInternetMessageId());
+        mailMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, pstMail.getInternetMessageId());
+        mailMetadata.set(TikaCoreProperties.IDENTIFIER, pstMail.getInternetMessageId());
+        mailMetadata.set(TikaCoreProperties.TITLE, pstMail.getSubject());
+        mailMetadata.set(Metadata.MESSAGE_FROM, pstMail.getSenderName());
+        mailMetadata.set(TikaCoreProperties.CREATOR, pstMail.getSenderName());
+        mailMetadata.set(TikaCoreProperties.CREATED, pstMail.getCreationTime());
+        mailMetadata.set(TikaCoreProperties.MODIFIED, pstMail.getLastModificationTime());
+        mailMetadata.set(TikaCoreProperties.COMMENTS, pstMail.getComment());
+        mailMetadata.set("descriptorNodeId", valueOf(pstMail.getDescriptorNodeId()));
+        mailMetadata.set("senderEmailAddress", pstMail.getSenderEmailAddress());
+        mailMetadata.set("recipients", pstMail.getRecipientsString());
+        mailMetadata.set("displayTo", pstMail.getDisplayTo());
+        mailMetadata.set("displayCC", pstMail.getDisplayCC());
+        mailMetadata.set("displayBCC", pstMail.getDisplayBCC());
+        mailMetadata.set("importance", valueOf(pstMail.getImportance()));
+        mailMetadata.set("priority", valueOf(pstMail.getPriority()));
+        mailMetadata.set("flagged", valueOf(pstMail.isFlagged()));
+
+        byte[] mailContent = pstMail.getBody().getBytes(UTF_8);
+        embeddedExtractor.parseEmbedded(new ByteArrayInputStream(mailContent), handler, mailMetadata, true);
+    }
+
+    private void parseMailAttachments(XHTMLContentHandler xhtml, PSTMessage email, EmbeddedDocumentExtractor embeddedExtractor)
+            throws TikaException {
+        int numberOfAttachments = email.getNumberOfAttachments();
+        for (int i = 0; i < numberOfAttachments; i++) {
+            File tempFile = null;
+            try {
+                PSTAttachment attach = email.getAttachment(i);
+
+                // Get the filename; both long and short filenames can be used for attachments
+                String filename = attach.getLongFilename();
+                if (filename.isEmpty()) {
+                    filename = attach.getFilename();
+                }
+
+                xhtml.element("p", filename);
+
+                Metadata attachMeta = new Metadata();
+                attachMeta.set(Metadata.RESOURCE_NAME_KEY, filename);
+                attachMeta.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filename);
+                AttributesImpl attributes = new AttributesImpl();
+                attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+                attributes.addAttribute("", "id", "id", "CDATA", filename);
+                xhtml.startElement("div", attributes);
+                if (embeddedExtractor.shouldParseEmbedded(attachMeta)) {
+                    TemporaryResources tmp = new TemporaryResources();
+                    try {
+                        TikaInputStream tis = TikaInputStream.get(attach.getFileInputStream(), tmp);
+                        embeddedExtractor.parseEmbedded(tis, xhtml, attachMeta, true);
+                    } finally {
+                        tmp.dispose();
+                    }
+                }
+                xhtml.endElement("div");
+
+            } catch (Exception e) {
+                throw new TikaException("Unable to unpack document stream", e);
+            } finally {
+                if (tempFile != null)
+                    tempFile.delete();
+            }
+        }
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java?rev=1723820&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java Sat Jan  9 02:03:05 2016
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+import java.util.Map;
+
+import org.apache.tika.detect.TypeDetector;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Before;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class MboxParserTest {
+
+    protected ParseContext recursingContext;
+    private Parser autoDetectParser;
+    private TypeDetector typeDetector;
+    private MboxParser mboxParser;
+
+    private static InputStream getStream(String name) {
+        return MboxParserTest.class.getClass().getResourceAsStream(name);
+    }
+
+    @Before
+    public void setUp() throws Exception {
+        typeDetector = new TypeDetector();
+        autoDetectParser = new AutoDetectParser(typeDetector);
+        recursingContext = new ParseContext();
+        recursingContext.set(Parser.class, autoDetectParser);
+
+        mboxParser = new MboxParser();
+        mboxParser.setTracking(true);
+    }
+
+    @Test
+    public void testSimple() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = getStream("/test-documents/simple.mbox")) {
+            mboxParser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        String content = handler.toString();
+        assertContains("Test content 1", content);
+        assertContains("Test content 2", content);
+        assertEquals("application/mbox", metadata.get(Metadata.CONTENT_TYPE));
+
+        Map<Integer, Metadata> mailsMetadata = mboxParser.getTrackingMetadata();
+        assertEquals("Nb. Of mails", 2, mailsMetadata.size());
+
+        Metadata mail1 = mailsMetadata.get(0);
+        assertEquals("message/rfc822", mail1.get(Metadata.CONTENT_TYPE));
+        assertEquals("envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009", mail1.get("MboxParser-from"));
+
+        Metadata mail2 = mailsMetadata.get(1);
+        assertEquals("message/rfc822", mail2.get(Metadata.CONTENT_TYPE));
+        assertEquals("envelope-sender-mailbox-name Mon Jun 01 11:00:00 2010", mail2.get("MboxParser-from"));
+    }
+
+    @Test
+    public void testHeaders() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = getStream("/test-documents/headers.mbox")) {
+            mboxParser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertContains("Test content", handler.toString());
+        assertEquals("Nb. Of mails", 1, mboxParser.getTrackingMetadata().size());
+
+        Metadata mailMetadata = mboxParser.getTrackingMetadata().get(0);
+
+        assertEquals("2009-06-10T03:58:45Z", mailMetadata.get(TikaCoreProperties.CREATED));
+        assertEquals("<au...@domain.com>", mailMetadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("subject", mailMetadata.get(Metadata.SUBJECT));
+        assertEquals("<au...@domain.com>", mailMetadata.get(Metadata.AUTHOR));
+        assertEquals("message/rfc822", mailMetadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("author@domain.com", mailMetadata.get("Message-From"));
+        assertEquals("<na...@domain.com>", mailMetadata.get("MboxParser-return-path"));
+    }
+
+    @Test
+    public void testMultilineHeader() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = getStream("/test-documents/multiline.mbox")) {
+            mboxParser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertEquals("Nb. Of mails", 1, mboxParser.getTrackingMetadata().size());
+
+        Metadata mailMetadata = mboxParser.getTrackingMetadata().get(0);
+        assertEquals("from xxx by xxx with xxx; date", mailMetadata.get("MboxParser-received"));
+    }
+
+    @Test
+    public void testQuoted() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = getStream("/test-documents/quoted.mbox")) {
+            mboxParser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertContains("Test content", handler.toString());
+        assertContains("> quoted stuff", handler.toString());
+    }
+
+    @Test
+    public void testComplex() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = getStream("/test-documents/complex.mbox")) {
+            mboxParser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertEquals("Nb. Of mails", 3, mboxParser.getTrackingMetadata().size());
+
+        Metadata firstMail = mboxParser.getTrackingMetadata().get(0);
+        assertEquals("Re: question about when shuffle/sort start working", firstMail.get(Metadata.SUBJECT));
+        assertEquals("Re: question about when shuffle/sort start working", firstMail.get(TikaCoreProperties.TITLE));
+        assertEquals("Jothi Padmanabhan <jo...@yahoo-inc.com>", firstMail.get(Metadata.AUTHOR));
+        assertEquals("Jothi Padmanabhan <jo...@yahoo-inc.com>", firstMail.get(TikaCoreProperties.CREATOR));
+        assertEquals("core-user@hadoop.apache.org", firstMail.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
+
+        assertContains("When a Mapper completes", handler.toString());
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java?rev=1723820&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java Sat Jan  9 02:03:05 2016
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.ToHTMLContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class OutlookPSTParserTest extends TikaTest {
+
+  private Parser parser = new OutlookPSTParser();
+
+  @Test
+  public void testAccept() throws Exception {
+    assertTrue((parser.getSupportedTypes(null).contains(MediaType.application("vnd.ms-outlook-pst"))));
+  }
+
+  @Test
+  public void testParse() throws Exception {
+    Parser pstParser = new AutoDetectParser();
+    Metadata metadata = new Metadata();
+    ContentHandler handler = new ToHTMLContentHandler();
+
+    ParseContext context = new ParseContext();
+    EmbeddedTrackingExtrator trackingExtrator = new EmbeddedTrackingExtrator(context);
+    context.set(EmbeddedDocumentExtractor.class, trackingExtrator);
+    context.set(Parser.class, new AutoDetectParser());
+
+    pstParser.parse(getResourceAsStream("/test-documents/testPST.pst"), handler, metadata, context);
+
+    String output = handler.toString();
+
+    assertFalse(output.isEmpty());
+    assertTrue(output.contains("<meta name=\"Content-Length\" content=\"271360\">"));
+    assertTrue(output.contains("<meta name=\"Content-Type\" content=\"application/vnd.ms-outlook-pst\">"));
+
+    assertTrue(output.contains("<body><div class=\"email-folder\"><h1>"));
+    assertTrue(output.contains("<div class=\"embedded\" id=\"&lt;530D9CAC.5080901@gmail.com&gt;\"><h1>Re: Feature Generators</h1>"));
+    assertTrue(output.contains("<div class=\"embedded\" id=\"&lt;1393363252.28814.YahooMailNeo@web140906.mail.bf1.yahoo.com&gt;\"><h1>Re: init tokenizer fails: \"Bad type in putfield/putstatic\"</h1>"));
+    assertTrue(output.contains("Gary Murphy commented on TIKA-1250:"));
+
+    assertTrue(output.contains("<div class=\"email-folder\"><h1>Racine (pour la recherche)</h1>"));
+
+
+    List<Metadata> metaList = trackingExtrator.trackingMetadata;
+    assertEquals(6, metaList.size());
+
+    Metadata firstMail = metaList.get(0);
+    assertEquals("Jörn Kottmann", firstMail.get(TikaCoreProperties.CREATOR));
+    assertEquals("Re: Feature Generators", firstMail.get(TikaCoreProperties.TITLE));
+    assertEquals("kottmann@gmail.com", firstMail.get("senderEmailAddress"));
+    assertEquals("users@opennlp.apache.org", firstMail.get("displayTo"));
+    assertEquals("", firstMail.get("displayCC"));
+    assertEquals("", firstMail.get("displayBCC"));
+  }
+
+
+  private class EmbeddedTrackingExtrator extends ParsingEmbeddedDocumentExtractor {
+    List<Metadata> trackingMetadata = new ArrayList<Metadata>();
+
+    public EmbeddedTrackingExtrator(ParseContext context) {
+      super(context);
+    }
+
+    @Override
+    public boolean shouldParseEmbedded(Metadata metadata) {
+      return true;
+    }
+
+    @Override
+    public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml) throws SAXException, IOException {
+      this.trackingMetadata.add(metadata);
+      super.parseEmbedded(stream, handler, metadata, outputHtml);
+    }
+
+  }
+}

Modified: tika/branches/2.x/tika-parser-modules/tika-web-module/pom.xml
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/pom.xml?rev=1723820&r1=1723819&r2=1723820&view=diff
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/pom.xml (original)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/pom.xml Sat Jan  9 02:03:05 2016
@@ -67,11 +67,6 @@
       <version>${mime4j.version}</version>
     </dependency>
     <dependency>
-      <groupId>com.pff</groupId>
-      <artifactId>java-libpst</artifactId>
-      <version>0.8.1</version>
-    </dependency>
-    <dependency>
       <groupId>commons-io</groupId>
       <artifactId>commons-io</artifactId>
       <version>${commons.io.version}</version>