You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/09 03:03:06 UTC
svn commit: r1723820 - in /tika/branches/2.x/tika-parser-modules:
tika-office-module/
tika-office-module/src/main/java/org/apache/tika/parser/mbox/
tika-office-module/src/test/java/org/apache/tika/parser/mbox/
tika-web-module/ tika-web-module/src/main/...
Author: bob
Date: Sat Jan 9 02:03:05 2016
New Revision: 1723820
URL: http://svn.apache.org/viewvc?rev=1723820&view=rev
Log:
TIKA-1824 - Move mbox to office.
Added:
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/mbox/
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
tika/branches/2.x/tika-parser-modules/tika-office-module/src/test/java/org/apache/tika/parser/mbox/
tika/branches/2.x/tika-parser-modules/tika-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
tika/branches/2.x/tika-parser-modules/tika-office-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java
Removed:
tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/mbox/
tika/branches/2.x/tika-parser-modules/tika-web-module/src/test/java/org/apache/tika/parser/mbox/
Modified:
tika/branches/2.x/tika-parser-modules/tika-office-module/pom.xml
tika/branches/2.x/tika-parser-modules/tika-web-module/pom.xml
Modified: tika/branches/2.x/tika-parser-modules/tika-office-module/pom.xml
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/pom.xml?rev=1723820&r1=1723819&r2=1723820&view=diff
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/pom.xml (original)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/pom.xml Sat Jan 9 02:03:05 2016
@@ -73,6 +73,11 @@
<version>2.1.1</version>
</dependency>
<dependency>
+ <groupId>com.pff</groupId>
+ <artifactId>java-libpst</artifactId>
+ <version>0.8.1</version>
+ </dependency>
+ <dependency>
<groupId>${project.groupId}</groupId>
<artifactId>tika-package-module</artifactId>
<version>${project.version}</version>
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java?rev=1723820&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java Sat Jan 9 02:03:05 2016
@@ -0,0 +1,209 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Collections;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Queue;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Mbox (mailbox) parser. This version extracts each mail from Mbox and uses the
+ * DelegatingParser to process each mail.
+ */
+public class MboxParser extends AbstractParser {
+
+ // MIME type string that parse() writes into the container metadata's Content-Type.
+ public static final String MBOX_MIME_TYPE = "application/mbox";
+ // Line prefix that parse() treats as the start of a new message record.
+ public static final String MBOX_RECORD_DIVIDER = "From ";
+ // Byte threshold checked by parse(): once the buffered message reaches this size,
+ // no further lines are appended to it.
+ public static final int MAIL_MAX_SIZE = 50000000;
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = -1762689436731160661L;
+ // The single media type returned by getSupportedTypes().
+ private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("mbox"));
+ // Whole-line match: group 1 is the header name (no spaces), group 2 the value after
+ // the colon and any spaces/tabs.
+ private static final Pattern EMAIL_HEADER_PATTERN = Pattern.compile("([^ ]+):[ \t]*(.*)");
+ // Captures the text between '<' and '>' when it contains an '@'.
+ // NOTE(review): both quantifiers are greedy, so a value holding several bracketed
+ // addresses yields one capture spanning from the first '<' to the last '>' - confirm
+ // whether multi-recipient headers are expected here.
+ private static final Pattern EMAIL_ADDRESS_PATTERN = Pattern.compile("<(.*@.*)>");
+
+ // Prefix for metadata keys built from headers that have no dedicated mapping.
+ private static final String EMAIL_HEADER_METADATA_PREFIX = "MboxParser-";
+ // Metadata key holding the remainder of the "From " separator line of each message.
+ private static final String EMAIL_FROMLINE_METADATA = EMAIL_HEADER_METADATA_PREFIX + "from";
+ // Per-message metadata keyed by a counter that parse() restarts at 0 on each call;
+ // only written to while tracking is enabled. The map itself is never cleared here.
+ private final Map<Integer, Metadata> trackingMetadata = new HashMap<Integer, Metadata>();
+ // When true, parse() records each message's metadata in trackingMetadata.
+ private boolean tracking = false;
+
+ /**
+  * Parses a mail {@code Date} header value of the form
+  * {@code EEE, d MMM yyyy HH:mm:ss Z} using US English day and month names.
+  * A fresh formatter is built on every call, so no formatter state is shared
+  * between callers.
+  *
+  * @param headerContent the raw value of the date header
+  * @return the parsed date
+  * @throws ParseException if the value does not match the expected pattern
+  */
+ public static Date parseDate(String headerContent) throws ParseException {
+     return new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US).parse(headerContent);
+ }
+
+ /**
+  * Returns the media types handled by this parser.
+  *
+  * @param context the parse context; not consulted by this implementation
+  * @return the constant singleton set containing {@code application/mbox}
+  */
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+     return SUPPORTED_TYPES;
+ }
+
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, TikaException, SAXException {
+
+ EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class,
+ new ParsingEmbeddedDocumentExtractor(context));
+
+ String charsetName = "windows-1252";
+
+ metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
+ metadata.set(Metadata.CONTENT_ENCODING, charsetName);
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+
+ InputStreamReader isr = new InputStreamReader(stream, charsetName);
+ try (BufferedReader reader = new BufferedReader(isr)) {
+ String curLine = reader.readLine();
+ int mailItem = 0;
+ do {
+ if (curLine.startsWith(MBOX_RECORD_DIVIDER)) {
+ Metadata mailMetadata = new Metadata();
+ Queue<String> multiline = new LinkedList<String>();
+ mailMetadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length()));
+ mailMetadata.set(Metadata.CONTENT_TYPE, "message/rfc822");
+ curLine = reader.readLine();
+
+ ByteArrayOutputStream message = new ByteArrayOutputStream(100000);
+ do {
+ if (curLine.startsWith(" ") || curLine.startsWith("\t")) {
+ String latestLine = multiline.poll();
+ latestLine += " " + curLine.trim();
+ multiline.add(latestLine);
+ } else {
+ multiline.add(curLine);
+ }
+
+ message.write(curLine.getBytes(charsetName));
+ message.write(0x0A);
+ curLine = reader.readLine();
+ }
+ while (curLine != null && !curLine.startsWith(MBOX_RECORD_DIVIDER) && message.size() < MAIL_MAX_SIZE);
+
+ for (String item : multiline) {
+ saveHeaderInMetadata(mailMetadata, item);
+ }
+
+ ByteArrayInputStream messageStream = new ByteArrayInputStream(message.toByteArray());
+ message = null;
+
+ if (extractor.shouldParseEmbedded(mailMetadata)) {
+ extractor.parseEmbedded(messageStream, xhtml, mailMetadata, true);
+ }
+
+ if (tracking) {
+ getTrackingMetadata().put(mailItem++, mailMetadata);
+ }
+ } else {
+ curLine = reader.readLine();
+ }
+
+ } while (curLine != null && !Thread.currentThread().isInterrupted());
+ }
+
+ xhtml.endDocument();
+ }
+
+ /**
+  * Tells whether per-message metadata is being recorded during parsing.
+  *
+  * @return the current value of the tracking flag
+  */
+ public boolean isTracking() {
+     return this.tracking;
+ }
+
+ /**
+  * Switches recording of per-message metadata on or off for subsequent parses.
+  *
+  * @param tracking {@code true} to record each message's metadata
+  */
+ public void setTracking(boolean tracking) {
+     this.tracking = tracking;
+ }
+
+ /**
+  * Gives access to the metadata recorded while tracking was enabled.
+  *
+  * @return the live, mutable map held by this parser (not a copy), keyed by message index
+  */
+ public Map<Integer, Metadata> getTrackingMetadata() {
+     return this.trackingMetadata;
+ }
+
+ private void saveHeaderInMetadata(Metadata metadata, String curLine) {
+ Matcher headerMatcher = EMAIL_HEADER_PATTERN.matcher(curLine);
+ if (!headerMatcher.matches()) {
+ return; // ignore malformed header lines
+ }
+
+ String headerTag = headerMatcher.group(1).toLowerCase(Locale.ROOT);
+ String headerContent = headerMatcher.group(2);
+
+ if (headerTag.equalsIgnoreCase("From")) {
+ metadata.set(TikaCoreProperties.CREATOR, headerContent);
+ } else if (headerTag.equalsIgnoreCase("To") || headerTag.equalsIgnoreCase("Cc")
+ || headerTag.equalsIgnoreCase("Bcc")) {
+ Matcher address = EMAIL_ADDRESS_PATTERN.matcher(headerContent);
+ if (address.find()) {
+ metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, address.group(1));
+ } else if (headerContent.indexOf('@') > -1) {
+ metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, headerContent);
+ }
+
+ String property = Metadata.MESSAGE_TO;
+ if (headerTag.equalsIgnoreCase("Cc")) {
+ property = Metadata.MESSAGE_CC;
+ } else if (headerTag.equalsIgnoreCase("Bcc")) {
+ property = Metadata.MESSAGE_BCC;
+ }
+ metadata.add(property, headerContent);
+ } else if (headerTag.equalsIgnoreCase("Subject")) {
+ metadata.add(Metadata.SUBJECT, headerContent);
+ } else if (headerTag.equalsIgnoreCase("Date")) {
+ try {
+ Date date = parseDate(headerContent);
+ metadata.set(TikaCoreProperties.CREATED, date);
+ } catch (ParseException e) {
+ // ignoring date because format was not understood
+ }
+ } else if (headerTag.equalsIgnoreCase("Message-Id")) {
+ metadata.set(TikaCoreProperties.IDENTIFIER, headerContent);
+ } else if (headerTag.equalsIgnoreCase("In-Reply-To")) {
+ metadata.set(TikaCoreProperties.RELATION, headerContent);
+ } else if (headerTag.equalsIgnoreCase("Content-Type")) {
+ // TODO - key off content-type in headers to
+ // set mapping to use for content and convert if necessary.
+
+ metadata.add(Metadata.CONTENT_TYPE, headerContent);
+ metadata.set(TikaCoreProperties.FORMAT, headerContent);
+ } else {
+ metadata.add(EMAIL_HEADER_METADATA_PREFIX + headerTag, headerContent);
+ }
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java?rev=1723820&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java Sat Jan 9 02:03:05 2016
@@ -0,0 +1,203 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import static java.lang.String.valueOf;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static java.util.Collections.singleton;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Set;
+
+import com.pff.PSTAttachment;
+import com.pff.PSTFile;
+import com.pff.PSTFolder;
+import com.pff.PSTMessage;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Parser for MS Outlook PST email storage files
+ */
+public class OutlookPSTParser extends AbstractParser {
+
+ // Serialization version identifier for this parser class.
+ private static final long serialVersionUID = 620998217748364063L;
+
+ // Media type for PST files; parse() writes its string form into Content-Type.
+ public static final MediaType MS_OUTLOOK_PST_MIMETYPE = MediaType.application("vnd.ms-outlook-pst");
+ // The single media type returned by getSupportedTypes().
+ private static final Set<MediaType> SUPPORTED_TYPES = singleton(MS_OUTLOOK_PST_MIMETYPE);
+
+ /**
+  * Builds a SAX attribute list holding exactly one {@code CDATA} attribute in the
+  * empty namespace, with the same local and qualified name.
+  *
+  * @param attName  the attribute name
+  * @param attValue the attribute value
+  * @return a new, mutable attribute list containing that single attribute
+  */
+ private static AttributesImpl createAttribute(String attName, String attValue) {
+     AttributesImpl single = new AttributesImpl();
+     single.addAttribute("", attName, attName, "CDATA", attValue);
+     return single;
+ }
+
+ /**
+  * Returns the media types handled by this parser.
+  *
+  * @param context the parse context; not consulted by this implementation
+  * @return the constant singleton set containing the PST media type
+  */
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+     return SUPPORTED_TYPES;
+ }
+
+ /**
+  * Opens the PST container and walks its folder tree, emitting one XHTML
+  * {@code div} per folder and per message and handing message bodies and
+  * attachments to the {@link EmbeddedDocumentExtractor}.
+  * <p>
+  * The PST library needs a file path, so the stream is accessed through a
+  * {@link TikaInputStream}. Any temporary file created for that purpose is
+  * registered with a local {@link TemporaryResources} and removed before this
+  * method returns; a stream that is already file-backed is left untouched.
+  * <p>
+  * Sets {@code Content-Type}, {@code Content-Length} and {@code isValid} on the
+  * container metadata. Folders are only traversed when the underlying file
+  * descriptor reports itself valid.
+  *
+  * @param stream   the PST content
+  * @param handler  receives the XHTML SAX events
+  * @param metadata container metadata to populate
+  * @param context  supplies the embedded document extractor, if one is registered
+  * @throws IOException   declared for the parser contract
+  * @throws SAXException  if starting or ending the document fails
+  * @throws TikaException wrapping any failure raised while reading the PST file,
+  *                       or if temporary resources cannot be released
+  */
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+         throws IOException, SAXException, TikaException {
+
+     // Use the delegate parser to parse the contained document
+     EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class,
+             new ParsingEmbeddedDocumentExtractor(context));
+
+     metadata.set(Metadata.CONTENT_TYPE, MS_OUTLOOK_PST_MIMETYPE.toString());
+
+     XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+     xhtml.startDocument();
+
+     // Owns whatever temp file TikaInputStream may create to give us a path.
+     TemporaryResources tmp = new TemporaryResources();
+     PSTFile pstFile = null;
+     try {
+         TikaInputStream in = TikaInputStream.get(stream, tmp);
+         pstFile = new PSTFile(in.getFile().getPath());
+         metadata.set(Metadata.CONTENT_LENGTH, valueOf(pstFile.getFileHandle().length()));
+         boolean isValid = pstFile.getFileHandle().getFD().valid();
+         metadata.set("isValid", valueOf(isValid));
+         if (isValid) {
+             parseFolder(xhtml, pstFile.getRootFolder(), embeddedExtractor);
+         }
+     } catch (Exception e) {
+         throw new TikaException(e.getMessage(), e);
+     } finally {
+         // Release the file handle first so the temp file can be deleted afterwards.
+         if (pstFile != null && pstFile.getFileHandle() != null) {
+             try {
+                 pstFile.getFileHandle().close();
+             } catch (IOException e) {
+                 //swallow closing exception
+             }
+         }
+         tmp.dispose();
+     }
+
+     xhtml.endDocument();
+ }
+
+ /**
+  * Emits the content of one PST folder and then recurses into its sub-folders.
+  * Each message becomes a {@code div class="embedded"} whose id is the internet
+  * message id and whose heading is the subject, followed by its body and
+  * attachments. Each sub-folder becomes a {@code div class="email-folder"}
+  * headed by its display name.
+  *
+  * @param handler           receives the XHTML events
+  * @param pstFolder         the folder to traverse
+  * @param embeddedExtractor extractor used for message bodies and attachments
+  * @throws Exception any failure raised by the PST library, the handler or the extractor
+  */
+ private void parseFolder(XHTMLContentHandler handler, PSTFolder pstFolder, EmbeddedDocumentExtractor embeddedExtractor)
+         throws Exception {
+     if (pstFolder.getContentCount() > 0) {
+         for (PSTMessage mail = (PSTMessage) pstFolder.getNextChild();
+                 mail != null;
+                 mail = (PSTMessage) pstFolder.getNextChild()) {
+             AttributesImpl divAttributes = createAttribute("class", "embedded");
+             divAttributes.addAttribute("", "id", "id", "CDATA", mail.getInternetMessageId());
+
+             handler.startElement("div", divAttributes);
+             handler.element("h1", mail.getSubject());
+             parserMailItem(handler, mail, embeddedExtractor);
+             parseMailAttachments(handler, mail, embeddedExtractor);
+             handler.endElement("div");
+         }
+     }
+
+     if (!pstFolder.hasSubfolders()) {
+         return;
+     }
+     for (PSTFolder child : pstFolder.getSubFolders()) {
+         handler.startElement("div", createAttribute("class", "email-folder"));
+         handler.element("h1", child.getDisplayName());
+         parseFolder(handler, child, embeddedExtractor);
+         handler.endElement("div");
+     }
+ }
+
+ /**
+  * Copies the descriptive properties of a PST message into a fresh
+  * {@link Metadata} object and passes the message body, encoded as UTF-8, to the
+  * embedded document extractor.
+  *
+  * @param handler           receives the XHTML events produced for the body
+  * @param pstMail           the message to export
+  * @param embeddedExtractor extractor that parses the body
+  * @throws SAXException if the handler rejects an event
+  * @throws IOException  if the body cannot be read by the extractor
+  */
+ private void parserMailItem(XHTMLContentHandler handler, PSTMessage pstMail, EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, IOException {
+     Metadata meta = new Metadata();
+
+     // The internet message id serves as name, relationship id and identifier.
+     String messageId = pstMail.getInternetMessageId();
+     meta.set(Metadata.RESOURCE_NAME_KEY, messageId);
+     meta.set(Metadata.EMBEDDED_RELATIONSHIP_ID, messageId);
+     meta.set(TikaCoreProperties.IDENTIFIER, messageId);
+
+     meta.set(TikaCoreProperties.TITLE, pstMail.getSubject());
+
+     String sender = pstMail.getSenderName();
+     meta.set(Metadata.MESSAGE_FROM, sender);
+     meta.set(TikaCoreProperties.CREATOR, sender);
+
+     meta.set(TikaCoreProperties.CREATED, pstMail.getCreationTime());
+     meta.set(TikaCoreProperties.MODIFIED, pstMail.getLastModificationTime());
+     meta.set(TikaCoreProperties.COMMENTS, pstMail.getComment());
+
+     // Properties without a dedicated Tika key are stored under plain string keys.
+     meta.set("descriptorNodeId", valueOf(pstMail.getDescriptorNodeId()));
+     meta.set("senderEmailAddress", pstMail.getSenderEmailAddress());
+     meta.set("recipients", pstMail.getRecipientsString());
+     meta.set("displayTo", pstMail.getDisplayTo());
+     meta.set("displayCC", pstMail.getDisplayCC());
+     meta.set("displayBCC", pstMail.getDisplayBCC());
+     meta.set("importance", valueOf(pstMail.getImportance()));
+     meta.set("priority", valueOf(pstMail.getPriority()));
+     meta.set("flagged", valueOf(pstMail.isFlagged()));
+
+     ByteArrayInputStream body = new ByteArrayInputStream(pstMail.getBody().getBytes(UTF_8));
+     embeddedExtractor.parseEmbedded(body, handler, meta, true);
+ }
+
+ /**
+  * Emits every attachment of a message: a paragraph with the file name, then a
+  * {@code div class="embedded"} (id = file name) into which the attachment is
+  * parsed if the extractor accepts it. The long file name is preferred; the
+  * short one is used when the long one is empty.
+  * <p>
+  * Temporary resources created while spooling an attachment are disposed as soon
+  * as that attachment has been handled.
+  *
+  * @param xhtml             receives the XHTML events
+  * @param email             the message whose attachments are exported
+  * @param embeddedExtractor extractor that decides on and performs the parsing
+  * @throws TikaException wrapping any failure while reading or parsing an attachment
+  */
+ private void parseMailAttachments(XHTMLContentHandler xhtml, PSTMessage email, EmbeddedDocumentExtractor embeddedExtractor)
+         throws TikaException {
+     int numberOfAttachments = email.getNumberOfAttachments();
+     for (int i = 0; i < numberOfAttachments; i++) {
+         try {
+             PSTAttachment attach = email.getAttachment(i);
+
+             // Get the filename; both long and short filenames can be used for attachments
+             String filename = attach.getLongFilename();
+             if (filename.isEmpty()) {
+                 filename = attach.getFilename();
+             }
+
+             xhtml.element("p", filename);
+
+             Metadata attachMeta = new Metadata();
+             attachMeta.set(Metadata.RESOURCE_NAME_KEY, filename);
+             attachMeta.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filename);
+             AttributesImpl attributes = new AttributesImpl();
+             attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+             attributes.addAttribute("", "id", "id", "CDATA", filename);
+             xhtml.startElement("div", attributes);
+             if (embeddedExtractor.shouldParseEmbedded(attachMeta)) {
+                 TemporaryResources tmp = new TemporaryResources();
+                 try {
+                     TikaInputStream tis = TikaInputStream.get(attach.getFileInputStream(), tmp);
+                     embeddedExtractor.parseEmbedded(tis, xhtml, attachMeta, true);
+                 } finally {
+                     tmp.dispose();
+                 }
+             }
+             xhtml.endElement("div");
+
+         } catch (Exception e) {
+             throw new TikaException("Unable to unpack document stream", e);
+         }
+     }
+ }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java?rev=1723820&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java Sat Jan 9 02:03:05 2016
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+import java.util.Map;
+
+import org.apache.tika.detect.TypeDetector;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Before;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class MboxParserTest {
+
+ // Context passed to every parse call; setUp() registers autoDetectParser in it.
+ protected ParseContext recursingContext;
+ // Parser registered in recursingContext under Parser.class.
+ private Parser autoDetectParser;
+ // Detector handed to the AutoDetectParser constructor in setUp().
+ private TypeDetector typeDetector;
+ // Parser under test; created with tracking enabled in setUp().
+ private MboxParser mboxParser;
+
+ /**
+  * Opens a test resource relative to this test class.
+  * <p>
+  * The lookup goes through {@code MboxParserTest.class} itself. Calling
+  * {@code getClass()} on the class literal would yield {@code java.lang.Class},
+  * whose resources are resolved by the bootstrap/system loader rather than the
+  * loader that actually holds the test documents.
+  *
+  * @param name resource name, e.g. {@code /test-documents/simple.mbox}
+  * @return the resource stream, or {@code null} if the resource is not found
+  */
+ private static InputStream getStream(String name) {
+     return MboxParserTest.class.getResourceAsStream(name);
+ }
+
+ @Before
+ public void setUp() throws Exception {
+ typeDetector = new TypeDetector();
+ autoDetectParser = new AutoDetectParser(typeDetector);
+ recursingContext = new ParseContext();
+ recursingContext.set(Parser.class, autoDetectParser);
+
+ mboxParser = new MboxParser();
+ mboxParser.setTracking(true);
+ }
+
+ /**
+  * Parses a two-message mailbox and checks the extracted text, the container
+  * content type, and the per-message content type and separator-line metadata.
+  */
+ @Test
+ public void testSimple() throws Exception {
+     Metadata containerMetadata = new Metadata();
+     ContentHandler body = new BodyContentHandler();
+
+     try (InputStream mbox = getStream("/test-documents/simple.mbox")) {
+         mboxParser.parse(mbox, body, containerMetadata, recursingContext);
+     }
+
+     String text = body.toString();
+     assertContains("Test content 1", text);
+     assertContains("Test content 2", text);
+     assertEquals("application/mbox", containerMetadata.get(Metadata.CONTENT_TYPE));
+
+     Map<Integer, Metadata> tracked = mboxParser.getTrackingMetadata();
+     assertEquals("Nb. Of mails", 2, tracked.size());
+
+     Metadata first = tracked.get(0);
+     assertEquals("message/rfc822", first.get(Metadata.CONTENT_TYPE));
+     assertEquals("envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009", first.get("MboxParser-from"));
+
+     Metadata second = tracked.get(1);
+     assertEquals("message/rfc822", second.get(Metadata.CONTENT_TYPE));
+     assertEquals("envelope-sender-mailbox-name Mon Jun 01 11:00:00 2010", second.get("MboxParser-from"));
+ }
+
+ /**
+  * Parses a single-message mailbox and checks how its headers are mapped onto
+  * the tracked message metadata.
+  */
+ @Test
+ public void testHeaders() throws Exception {
+     Metadata containerMetadata = new Metadata();
+     ContentHandler body = new BodyContentHandler();
+
+     try (InputStream mbox = getStream("/test-documents/headers.mbox")) {
+         mboxParser.parse(mbox, body, containerMetadata, recursingContext);
+     }
+
+     assertContains("Test content", body.toString());
+
+     Map<Integer, Metadata> tracked = mboxParser.getTrackingMetadata();
+     assertEquals("Nb. Of mails", 1, tracked.size());
+
+     Metadata mail = mboxParser.getTrackingMetadata().get(0);
+     assertEquals("2009-06-10T03:58:45Z", mail.get(TikaCoreProperties.CREATED));
+     assertEquals("<au...@domain.com>", mail.get(TikaCoreProperties.CREATOR));
+     assertEquals("subject", mail.get(Metadata.SUBJECT));
+     assertEquals("<au...@domain.com>", mail.get(Metadata.AUTHOR));
+     assertEquals("message/rfc822", mail.get(Metadata.CONTENT_TYPE));
+     assertEquals("author@domain.com", mail.get("Message-From"));
+     assertEquals("<na...@domain.com>", mail.get("MboxParser-return-path"));
+ }
+
+ /**
+  * Checks that a header folded over several lines is stored as a single,
+  * space-joined metadata value.
+  */
+ @Test
+ public void testMultilineHeader() throws Exception {
+     Metadata containerMetadata = new Metadata();
+     ContentHandler body = new BodyContentHandler();
+
+     try (InputStream mbox = getStream("/test-documents/multiline.mbox")) {
+         mboxParser.parse(mbox, body, containerMetadata, recursingContext);
+     }
+
+     Map<Integer, Metadata> tracked = mboxParser.getTrackingMetadata();
+     assertEquals("Nb. Of mails", 1, tracked.size());
+
+     Metadata mail = mboxParser.getTrackingMetadata().get(0);
+     assertEquals("from xxx by xxx with xxx; date", mail.get("MboxParser-received"));
+ }
+
+ /**
+  * Checks that both the message text and a quoted ("&gt; ") line appear in the
+  * extracted content.
+  */
+ @Test
+ public void testQuoted() throws Exception {
+     Metadata containerMetadata = new Metadata();
+     ContentHandler body = new BodyContentHandler();
+
+     try (InputStream mbox = getStream("/test-documents/quoted.mbox")) {
+         mboxParser.parse(mbox, body, containerMetadata, recursingContext);
+     }
+
+     assertContains("Test content", body.toString());
+     assertContains("> quoted stuff", body.toString());
+ }
+
+ /**
+  * Parses a three-message mailbox and checks subject, title, author, creator
+  * and recipient address of the first message, plus a phrase from the body text.
+  */
+ @Test
+ public void testComplex() throws Exception {
+     Metadata containerMetadata = new Metadata();
+     ContentHandler body = new BodyContentHandler();
+
+     try (InputStream mbox = getStream("/test-documents/complex.mbox")) {
+         mboxParser.parse(mbox, body, containerMetadata, recursingContext);
+     }
+
+     Map<Integer, Metadata> tracked = mboxParser.getTrackingMetadata();
+     assertEquals("Nb. Of mails", 3, tracked.size());
+
+     Metadata first = mboxParser.getTrackingMetadata().get(0);
+     assertEquals("Re: question about when shuffle/sort start working", first.get(Metadata.SUBJECT));
+     assertEquals("Re: question about when shuffle/sort start working", first.get(TikaCoreProperties.TITLE));
+     assertEquals("Jothi Padmanabhan <jo...@yahoo-inc.com>", first.get(Metadata.AUTHOR));
+     assertEquals("Jothi Padmanabhan <jo...@yahoo-inc.com>", first.get(TikaCoreProperties.CREATOR));
+     assertEquals("core-user@hadoop.apache.org", first.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
+
+     assertContains("When a Mapper completes", body.toString());
+ }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java?rev=1723820&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/test/java/org/apache/tika/parser/mbox/OutlookPSTParserTest.java Sat Jan 9 02:03:05 2016
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.ToHTMLContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class OutlookPSTParserTest extends TikaTest {
+
+ // Direct instance of the parser under test; used by testAccept().
+ private Parser parser = new OutlookPSTParser();
+
+ @Test
+ public void testAccept() throws Exception {
+ assertTrue((parser.getSupportedTypes(null).contains(MediaType.application("vnd.ms-outlook-pst"))));
+ }
+
+ /**
+  * Parses the sample PST file through an auto-detecting parser and checks the
+  * generated HTML (container metadata, folder and message markup, a phrase from
+  * a message body) as well as the metadata handed to the embedded extractor for
+  * the first message.
+  */
+ @Test
+ public void testParse() throws Exception {
+     Parser pstParser = new AutoDetectParser();
+     Metadata metadata = new Metadata();
+     ContentHandler handler = new ToHTMLContentHandler();
+
+     ParseContext context = new ParseContext();
+     EmbeddedTrackingExtrator trackingExtrator = new EmbeddedTrackingExtrator(context);
+     context.set(EmbeddedDocumentExtractor.class, trackingExtrator);
+     context.set(Parser.class, new AutoDetectParser());
+
+     // Close the resource stream once parsing is done instead of leaking it.
+     try (InputStream stream = getResourceAsStream("/test-documents/testPST.pst")) {
+         pstParser.parse(stream, handler, metadata, context);
+     }
+
+     String output = handler.toString();
+
+     assertFalse(output.isEmpty());
+     assertTrue(output.contains("<meta name=\"Content-Length\" content=\"271360\">"));
+     assertTrue(output.contains("<meta name=\"Content-Type\" content=\"application/vnd.ms-outlook-pst\">"));
+
+     assertTrue(output.contains("<body><div class=\"email-folder\"><h1>"));
+     assertTrue(output.contains("<div class=\"embedded\" id=\"<530D9CAC.5080901@gmail.com>\"><h1>Re: Feature Generators</h1>"));
+     assertTrue(output.contains("<div class=\"embedded\" id=\"<1393363252.28814.YahooMailNeo@web140906.mail.bf1.yahoo.com>\"><h1>Re: init tokenizer fails: \"Bad type in putfield/putstatic\"</h1>"));
+     assertTrue(output.contains("Gary Murphy commented on TIKA-1250:"));
+
+     assertTrue(output.contains("<div class=\"email-folder\"><h1>Racine (pour la recherche)</h1>"));
+
+
+     List<Metadata> metaList = trackingExtrator.trackingMetadata;
+     assertEquals(6, metaList.size());
+
+     Metadata firstMail = metaList.get(0);
+     assertEquals("Jörn Kottmann", firstMail.get(TikaCoreProperties.CREATOR));
+     assertEquals("Re: Feature Generators", firstMail.get(TikaCoreProperties.TITLE));
+     assertEquals("kottmann@gmail.com", firstMail.get("senderEmailAddress"));
+     assertEquals("users@opennlp.apache.org", firstMail.get("displayTo"));
+     assertEquals("", firstMail.get("displayCC"));
+     assertEquals("", firstMail.get("displayBCC"));
+ }
+
+
+ /**
+  * Embedded-document extractor that accepts every embedded document and
+  * remembers, in call order, the metadata object passed for each one before
+  * delegating to the regular parsing extractor.
+  * <p>
+  * Declared {@code static} because it uses nothing from the enclosing test
+  * instance.
+  */
+ private static class EmbeddedTrackingExtrator extends ParsingEmbeddedDocumentExtractor {
+     /** Metadata of every embedded document seen so far, in the order received. */
+     final List<Metadata> trackingMetadata = new ArrayList<Metadata>();
+
+     public EmbeddedTrackingExtrator(ParseContext context) {
+         super(context);
+     }
+
+     @Override
+     public boolean shouldParseEmbedded(Metadata metadata) {
+         return true;
+     }
+
+     @Override
+     public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml) throws SAXException, IOException {
+         // Record first, so the entry exists even if the delegate throws.
+         this.trackingMetadata.add(metadata);
+         super.parseEmbedded(stream, handler, metadata, outputHtml);
+     }
+
+ }
+}
Modified: tika/branches/2.x/tika-parser-modules/tika-web-module/pom.xml
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/pom.xml?rev=1723820&r1=1723819&r2=1723820&view=diff
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/pom.xml (original)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/pom.xml Sat Jan 9 02:03:05 2016
@@ -67,11 +67,6 @@
<version>${mime4j.version}</version>
</dependency>
<dependency>
- <groupId>com.pff</groupId>
- <artifactId>java-libpst</artifactId>
- <version>0.8.1</version>
- </dependency>
- <dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>${commons.io.version}</version>