You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/10/17 18:10:58 UTC

tika git commit: TIKA-2122 : add all headers from MSG and RFC822 files

Repository: tika
Updated Branches:
  refs/heads/master bfd1d9139 -> 8e819c3ca


TIKA-2122 : add all headers from MSG and RFC822 files


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/8e819c3c
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/8e819c3c
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/8e819c3c

Branch: refs/heads/master
Commit: 8e819c3caf3ff3b0492f600b4193d1b3ee74f51b
Parents: bfd1d91
Author: tballison <ta...@mitre.org>
Authored: Mon Oct 17 14:10:46 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Mon Oct 17 14:10:46 2016 -0400

----------------------------------------------------------------------
 .../java/org/apache/tika/metadata/Message.java  |  6 ++
 .../src/test/java/org/apache/tika/TikaTest.java |  8 ++
 .../tika/parser/mail/MailContentHandler.java    |  5 ++
 .../tika/parser/microsoft/OutlookExtractor.java | 87 +++++++++++++++++++-
 .../tika/parser/mail/RFC822ParserTest.java      |  1 +
 .../parser/microsoft/OutlookParserTest.java     | 15 ++++
 6 files changed, 121 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/8e819c3c/tika-core/src/main/java/org/apache/tika/metadata/Message.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Message.java b/tika-core/src/main/java/org/apache/tika/metadata/Message.java
index ffb9413..dad3952 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Message.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Message.java
@@ -16,10 +16,16 @@
  */
 package org.apache.tika.metadata;
 
+import org.apache.tika.Tika;
+
 /**
  * A collection of Message related property names.
  */
 public interface Message {
+    String MESSAGE_PREFIX = "Message"+ Metadata.NAMESPACE_PREFIX_DELIMITER;
+
+    String MESSAGE_RAW_HEADER_PREFIX = MESSAGE_PREFIX+"Raw-Header"+Metadata.NAMESPACE_PREFIX_DELIMITER;
+
     String MESSAGE_RECIPIENT_ADDRESS = "Message-Recipient-Address";
     
     String MESSAGE_FROM = "Message-From";

http://git-wip-us.apache.org/repos/asf/tika/blob/8e819c3c/tika-core/src/test/java/org/apache/tika/TikaTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 690db33..0bc5a83 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -296,4 +296,12 @@ public abstract class TikaTest {
             i++;
         }
     }
+
+    public static void debug(Metadata metadata) {
+        for (String n : metadata.names()) {
+            for (String v : metadata.getValues(n)) {
+                System.out.println(n + " : "+v);
+            }
+        }
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/8e819c3c/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
index 6a9bc1b..60170e6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
@@ -51,6 +51,7 @@ import org.apache.tika.config.TikaConfig;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
 import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Message;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.AutoDetectParser;
@@ -238,6 +239,7 @@ class MailContentHandler implements ContentHandler {
 
         try {
             String fieldname = field.getName();
+
             ParsedField parsedField = LenientFieldParser.getParser().parse(
                     field, DecodeMonitor.SILENT);
             if (fieldname.equalsIgnoreCase("From")) {
@@ -276,6 +278,9 @@ class MailContentHandler implements ContentHandler {
                     date = tryOtherDateFormats(field.getBody());
                 }
                 metadata.set(TikaCoreProperties.CREATED, date);
+            } else {
+                metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX+parsedField.getName(),
+                        field.getBody());
             }
         } catch (RuntimeException me) {
             if (strictParsing) {

http://git-wip-us.apache.org/repos/asf/tika/blob/8e819c3c/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index c1db274..76ac17f 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -25,13 +25,19 @@ import java.nio.charset.Charset;
 import java.nio.charset.IllegalCharsetNameException;
 import java.nio.charset.UnsupportedCharsetException;
 import java.text.ParseException;
+import java.util.ArrayList;
 import java.util.Date;
+import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import org.apache.james.mime4j.codec.DecodeMonitor;
+import org.apache.james.mime4j.codec.DecoderUtil;
+import org.apache.james.mime4j.dom.field.ParsedField;
+import org.apache.james.mime4j.field.LenientFieldParser;
 import org.apache.poi.hmef.attribute.MAPIRtfAttribute;
 import org.apache.poi.hsmf.MAPIMessage;
 import org.apache.poi.hsmf.datatypes.AttachmentChunks;
@@ -66,6 +72,18 @@ import org.xml.sax.SAXException;
  * Outlook Message Parser.
  */
 public class OutlookExtractor extends AbstractPOIFSExtractor {
+
+
+    private static Pattern HEADER_KEY_PAT =
+            Pattern.compile("\\A([\\x21-\\x39\\x3B-\\x7E]+):(.*?)\\Z");
+    //this is according to the spec; in practice, it is probably more likely
+    //that a "split field" fails to start with a space character than
+    //that a real header contains anything but [-_A-Za-z0-9].
+    //e.g.
+    //header: this header goes onto the next line
+    //<mailto:xyz@cnn.com...
+
+
     private static final Metadata EMPTY_METADATA = new Metadata();
     HtmlEncodingDetector detector = new HtmlEncodingDetector();
 
@@ -118,8 +136,19 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
                 }
             } catch (ChunkNotFoundException he) {
             } // Will be fixed in POI 3.7 Final
+            try {
+                Map<String, String[]> headers = normalizeHeaders(msg.getHeaders());
+                for (Map.Entry<String, String[]> e : headers.entrySet()) {
+                    String headerKey = e.getKey();
+                    for (String headerValue : e.getValue()) {
+                        metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX+headerKey, headerValue);
+                    }
+                }
+            } catch (ChunkNotFoundException e) {
+
+            }
 
-            // Date - try two ways to find it
+                    // Date - try two ways to find it
             // First try via the proper chunk
             if (msg.getMessageDate() != null) {
                 metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
@@ -264,6 +293,62 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
         }
     }
 
+    //As of POI 3.15, getHeaders() returns a String[] made by splitting on /\r?\n/.
+    //This rebuilds header values that were broken up over several lines,
+    //and it also decodes any encoded words in the header values.
+    private Map<String, String[]> normalizeHeaders(String[] rows) {
+        Map<String, String[]> ret = new LinkedHashMap<>();
+        if (rows == null) {
+            return ret;
+        }
+        StringBuilder sb = new StringBuilder();
+        Map<String, List<String>> headers = new LinkedHashMap();
+        Matcher headerKeyMatcher = HEADER_KEY_PAT.matcher("");
+        String lastKey = null;
+        int consec = 0;
+        for (String row : rows) {
+            headerKeyMatcher.reset(row);
+            if (headerKeyMatcher.find()) {
+                if (lastKey != null) {
+                    List<String> vals = headers.get(lastKey);
+                    vals = (vals == null) ? new ArrayList<String>() : vals;
+                    vals.add(decodeHeader(sb.toString()));
+                    headers.put(lastKey, vals);
+                }
+                //reset sb
+                sb.setLength(0);
+                lastKey = headerKeyMatcher.group(1).trim();
+                sb.append(headerKeyMatcher.group(2).trim());
+                consec = 0;
+            } else {
+                if (consec > 0) {
+                    sb.append("\n");
+                }
+                sb.append(row);
+            }
+            consec++;
+        }
+
+        //make sure to add the last value
+        if (sb.length() > 0 && lastKey != null) {
+            List<String> vals = headers.get(lastKey);
+            vals = (vals == null) ? new ArrayList<String>() : vals;
+            vals.add(decodeHeader(sb.toString()));
+            headers.put(lastKey, vals);
+        }
+
+        //convert to array
+        for (Map.Entry<String, List<String>> e : headers.entrySet()) {
+            ret.put(e.getKey(), e.getValue().toArray(new String[e.getValue().size()]));
+        }
+        return ret;
+
+    }
+
+    private String decodeHeader(String header) {
+        return DecoderUtil.decodeEncodedWords(header, DecodeMonitor.SILENT);
+    }
+
     private void header(XHTMLContentHandler xhtml, String key, String value)
             throws SAXException {
         if (value != null && value.length() > 0) {

http://git-wip-us.apache.org/repos/asf/tika/blob/8e819c3c/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
index c7fcbfb..035b1c2 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
@@ -367,6 +367,7 @@ public class RFC822ParserTest extends TikaTest {
         assertContains("TEST DATA FOR TIKA.", handler.toString());
         assertContains("This is text inside an unencrypted zip file", handler.toString());
         assertContains("TIKA-1028", handler.toString());
+        assertEquals("<ju...@gmail.com>", metadata.get("Message:Raw-Header:Return-Path"));
     }
 
     /**

http://git-wip-us.apache.org/repos/asf/tika/blob/8e819c3c/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index 8662e65..c15308f 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -26,6 +26,7 @@ import javax.xml.transform.sax.TransformerHandler;
 import javax.xml.transform.stream.StreamResult;
 import java.io.InputStream;
 import java.io.StringWriter;
+import java.util.Arrays;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
@@ -71,6 +72,12 @@ public class OutlookParserTest extends TikaTest {
                 "L'\u00C9quipe Microsoft Outlook Express",
                 metadata.get(Metadata.AUTHOR));
 
+        //ensure that "raw" header is correctly decoded
+        assertEquals(
+                "L'\u00C9quipe Microsoft Outlook Express <ms...@microsoft.com>",
+                metadata.get(Metadata.MESSAGE_RAW_HEADER_PREFIX+"From"));
+
+
         // Stored as Thu, 5 Apr 2007 09:26:06 -0700
         assertEquals(
                 "2007-04-05T16:26:06Z",
@@ -108,6 +115,14 @@ public class OutlookParserTest extends TikaTest {
         Matcher matcher = pattern.matcher(content);
         assertTrue(matcher.find());
         assertFalse(matcher.find());
+
+        //test that last header is added
+        assertContains("29 Jan 2009 19:17:10.0163 (UTC) FILETIME=[2ED25E30:01C98246]",
+                Arrays.asList(metadata.getValues("Message:Raw-Header:X-OriginalArrivalTime")));
+        //confirm that a header value continued onto the next line is rebuilt correctly
+        assertContains("from athena.apache.org (HELO athena.apache.org) (140.211.11.136)\n" +
+                "    by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 29 Jan 2009 11:17:08 -0800",
+                Arrays.asList(metadata.getValues("Message:Raw-Header:Received")));
     }
 
     /**