You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/01/19 11:26:41 UTC
[tika] branch main updated: TIKA-3957 (#910)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new ac9c0f899 TIKA-3957 (#910)
ac9c0f899 is described below
commit ac9c0f899a2226d2196ca67eca06d499bf9f8e74
Author: Tim Allison <ta...@apache.org>
AuthorDate: Thu Jan 19 06:26:33 2023 -0500
TIKA-3957 (#910)
* TIKA-3957 -- refactor date parsing out of the MailContentHandler and prefer thread-safe DateTimeFormatters over thread-unsafe SimpleDateFormats, add workarounds for at least 2 bugs in jdk8.
---
tika-parent/pom.xml | 2 +-
.../tika/parser/mailcommons/MailDateParser.java | 577 ++++++++++++++++++++-
.../parser/mailcommons/MailDateParserTest.java | 186 +++++++
.../tika/parser/mail/MailContentHandler.java | 125 +----
.../org/apache/tika/parser/mbox/MboxParser.java | 13 +-
.../apache/tika/parser/mail/RFC822ParserTest.java | 75 +--
.../tika/parser/microsoft/OutlookExtractor.java | 7 +-
7 files changed, 785 insertions(+), 200 deletions(-)
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 8c3cefed0..64e2a48ee 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -365,7 +365,7 @@
<lucene.version>8.11.2</lucene.version>
<metadata.extractor.version>2.18.0</metadata.extractor.version>
<microsoft.translator.version>0.6.2</microsoft.translator.version>
- <!-- 0.8.6 is built with java 11 and does not work with Java 8 -->
+ <!-- 0.8.5 is built with java 11 and does not work with Java 8 -->
<mime4j.version>0.8.4</mime4j.version>
<mockito.version>4.11.0</mockito.version>
<netcdf-java.version>4.5.5</netcdf-java.version>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-commons/src/main/java/org/apache/tika/parser/mailcommons/MailDateParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-commons/src/main/java/org/apache/tika/parser/mailcommons/MailDateParser.java
index 8ab470074..3a5220710 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-commons/src/main/java/org/apache/tika/parser/mailcommons/MailDateParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-commons/src/main/java/org/apache/tika/parser/mailcommons/MailDateParser.java
@@ -16,15 +16,584 @@
*/
package org.apache.tika.parser.mailcommons;
+import static java.time.ZoneOffset.UTC;
+import static java.time.temporal.ChronoField.AMPM_OF_DAY;
+import static java.time.temporal.ChronoField.DAY_OF_MONTH;
+import static java.time.temporal.ChronoField.DAY_OF_WEEK;
+import static java.time.temporal.ChronoField.HOUR_OF_AMPM;
+import static java.time.temporal.ChronoField.HOUR_OF_DAY;
+import static java.time.temporal.ChronoField.INSTANT_SECONDS;
+import static java.time.temporal.ChronoField.MILLI_OF_SECOND;
+import static java.time.temporal.ChronoField.MINUTE_OF_HOUR;
+import static java.time.temporal.ChronoField.MONTH_OF_YEAR;
+import static java.time.temporal.ChronoField.OFFSET_SECONDS;
+import static java.time.temporal.ChronoField.SECOND_OF_MINUTE;
+import static java.time.temporal.ChronoField.YEAR;
+import static org.apache.tika.utils.DateUtils.MIDDAY;
+
import java.text.ParseException;
-import java.text.SimpleDateFormat;
+import java.text.ParsePosition;
+import java.time.DateTimeException;
+import java.time.Instant;
+import java.time.LocalDate;
+import java.time.LocalDateTime;
+import java.time.ZonedDateTime;
+import java.time.format.DateTimeFormatter;
+import java.time.format.DateTimeFormatterBuilder;
+import java.time.format.DateTimeParseException;
+import java.time.format.ResolverStyle;
+import java.time.format.SignStyle;
+import java.time.temporal.ChronoField;
+import java.time.temporal.TemporalAccessor;
import java.util.Date;
+import java.util.HashMap;
import java.util.Locale;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.utils.StringUtils;
+/**
+ * Dates in emails are a mess. There are at least two major date related bugs in JDK 8.
+ * This class does its best to parse date strings. It does have a US-based date bias.
+ * Please open a ticket to fix this. We can also add overrides via the parser config
+ * to manage custom dates.
+ */
public class MailDateParser {
- public static Date parseDate(String headerContent) throws ParseException {
- SimpleDateFormat dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US);
- return dateFormat.parse(headerContent);
+
+ //TIKA-1970 Mac Mail's format is GMT+1 so we need to check for hour only
+ //Also, there are numerous bugs in jdk 8 with localized offsets
+ //so we need to get rid of the GMT/UTC component (e.g. https://bugs.openjdk.org/browse/JDK-8154520)
+ private static final Pattern LOCALIZED_OFFSET_PATTERN =
+ Pattern.compile("(?:UTC|GMT)\\s*([-+])\\s*(\\d?\\d):?(\\d\\d)?\\Z");
+
+ //this is used to strip junk after a fairly full offset:
+ // Wed, 26 Jan 2022 09:14:37 +0100 (CET)
+ private static final Pattern OFFSET_PATTERN =
+ Pattern.compile("[-+]\\s*\\d?\\d:?\\d\\d");
+
+ private static final Pattern DAYS_OF_WEEK =
+ Pattern.compile("(?:\\A| )(MON|MONDAY|TUE|TUES|TUESDAY|WED|WEDNESDAY|THU|THUR|THURS" +
+ "|THURSDAY|FRI|FRIDAY|SAT|SATURDAY|SUN|SUNDAY) ");
+
+ //find a time ending in am/pm without a space: 10:30am and
+ //use this pattern to insert space: 10:30 am
+ private static final Pattern AM_PM = Pattern.compile("(?i)(\\d)([ap]m)\\b");
+
+ //Taken nearly directly from mime4j
+ /**
+ * Builds the month-number to upper-case three-letter abbreviation table
+ * (1 = JAN ... 12 = DEC) that RFC_5322 hands to appendText(MONTH_OF_YEAR, ...).
+ *
+ * @return a freshly allocated, mutable map on every call
+ */
+ private static Map<Long, String> monthOfYear() {
+ HashMap<Long, String> result = new HashMap<>();
+ result.put(1L, "JAN");
+ result.put(2L, "FEB");
+ result.put(3L, "MAR");
+ result.put(4L, "APR");
+ result.put(5L, "MAY");
+ result.put(6L, "JUN");
+ result.put(7L, "JUL");
+ result.put(8L, "AUG");
+ result.put(9L, "SEP");
+ result.put(10L, "OCT");
+ result.put(11L, "NOV");
+ result.put(12L, "DEC");
+ return result;
+ }
+
+ /**
+ * Builds the day-number to upper-case three-letter abbreviation table
+ * (1 = MON ... 7 = SUN) that RFC_5322 hands to appendText(DAY_OF_WEEK, ...).
+ *
+ * @return a freshly allocated, mutable map on every call
+ */
+ private static Map<Long, String> dayOfWeek() {
+ HashMap<Long, String> result = new HashMap<>();
+ result.put(1L, "MON");
+ result.put(2L, "TUE");
+ result.put(3L, "WED");
+ result.put(4L, "THU");
+ result.put(5L, "FRI");
+ result.put(6L, "SAT");
+ result.put(7L, "SUN");
+ return result;
}
+ private static final int INITIAL_YEAR = 1970;
+
+ //Shared, fully optional time zone suffix that the lenient formatters below append:
+ //an optional space, then an optional zone id, an optional offset and an optional zone name.
+ //Every section is optional, so this formatter also matches an empty suffix.
+ private static final DateTimeFormatter TIME_ZONE_FORMATTER
+ = new DateTimeFormatterBuilder()
+ .parseCaseInsensitive()
+ .parseLenient()
+ .optionalStart()
+ .appendLiteral(' ') //optional space before any of the time zone offset/ids
+ .optionalEnd()
+ .optionalStart()
+ .appendZoneId()
+ .optionalEnd()
+ .optionalStart()
+ .appendPattern("X")//zone offset with 'Z' for zero (not the localized "O" form), e.g. Z; -08; -0830. NOTE(review): -08:30; -083015; -08:30:15 presumably rely on parseLenient() -- confirm on the target JDK
+ .optionalEnd()
+ .optionalStart()
+ .appendPattern("z")//zone name, e.g. PST
+ .optionalEnd().toFormatter(Locale.US);
+
+
+ public static final DateTimeFormatter RFC_5322 = new DateTimeFormatterBuilder()
+ .parseCaseInsensitive()
+ .parseLenient()
+ .optionalStart()
+ .appendText(DAY_OF_WEEK, dayOfWeek())
+ .appendLiteral(", ")
+ .optionalEnd()
+ .appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NOT_NEGATIVE)
+ .appendLiteral(' ')
+ .appendText(MONTH_OF_YEAR, monthOfYear())
+ .appendLiteral(' ')
+ .appendValueReduced(YEAR, 2, 4, INITIAL_YEAR)
+ .appendLiteral(' ')
+ .appendValue(HOUR_OF_DAY, 2)
+ .appendLiteral(':')
+ .appendValue(MINUTE_OF_HOUR, 2)
+ .optionalStart()
+ .appendLiteral(':')
+ .appendValue(SECOND_OF_MINUTE, 2)
+ .optionalEnd()
+ .optionalStart()
+ .appendLiteral('.')
+ .appendValue(MILLI_OF_SECOND, 3)
+ .optionalEnd()
+ .optionalStart()
+ .appendLiteral(' ')
+ .appendOffset("+HHMM", "GMT")
+ .optionalEnd()
+ .toFormatter(Locale.US)
+ //.withZone(ZoneId.of("GMT")) see TIKA-3735
+ .withResolverStyle(ResolverStyle.LENIENT)
+ .withResolverFields(DAY_OF_MONTH, MONTH_OF_YEAR, YEAR, HOUR_OF_DAY, MINUTE_OF_HOUR,
+ SECOND_OF_MINUTE, MILLI_OF_SECOND, OFFSET_SECONDS);
+
+ public static final DateTimeFormatter RFC_5322_LENIENT = new DateTimeFormatterBuilder()
+ .parseCaseInsensitive()
+ .parseLenient()
+ .appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NEVER)
+ .appendLiteral(' ')
+ .appendPattern("MMM")
+ .appendLiteral(' ')
+ .appendValueReduced(YEAR, 2, 4, INITIAL_YEAR)
+ .appendLiteral(' ')
+ .appendValue(HOUR_OF_DAY, 1, 2, SignStyle.NEVER)
+ .appendLiteral(':')
+ .appendValue(MINUTE_OF_HOUR, 1, 2, SignStyle.NEVER)
+ .optionalStart()
+ .appendLiteral(':')
+ .appendValue(SECOND_OF_MINUTE, 2)
+ .optionalEnd()
+ .optionalStart()
+ .appendLiteral('.')
+ .appendValue(MILLI_OF_SECOND, 3, 5, SignStyle.NEVER)
+ .optionalEnd()
+ .optionalStart()
+ .append(TIME_ZONE_FORMATTER)
+ .optionalEnd()
+ .toFormatter(Locale.US)
+ //.withZone(ZoneId.of("GMT")) see TIKA-3735
+ .withResolverStyle(ResolverStyle.LENIENT)
+ .withResolverFields(DAY_OF_MONTH, MONTH_OF_YEAR, YEAR,
+ HOUR_OF_DAY, MINUTE_OF_HOUR,
+ SECOND_OF_MINUTE, MILLI_OF_SECOND, OFFSET_SECONDS);
+
+
+ //this differs only from RFC_5322_LENIENT in requiring am/pm
+ /**
+ * Lenient "d MMM yy[yy] h:mm[:ss][.SSS][ ]am/pm[ zone]" formatter.
+ * The hour is parsed as HOUR_OF_AMPM and an am/pm marker is mandatory; seconds,
+ * fractional seconds, the space before am/pm and the time zone suffix are optional.
+ * Two-digit years are resolved relative to INITIAL_YEAR.
+ */
+ public static final DateTimeFormatter RFC_5322_AMPM_LENIENT = new DateTimeFormatterBuilder()
+ .parseCaseInsensitive()
+ .parseLenient()
+ .appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NEVER)
+ .appendLiteral(' ')
+ .appendPattern("MMM")
+ .appendLiteral(' ')
+ .appendValueReduced(YEAR, 2, 4, INITIAL_YEAR)
+ .appendLiteral(' ')
+ .appendValue(ChronoField.HOUR_OF_AMPM, 1, 2, SignStyle.NEVER)
+ .appendLiteral(':')
+ .appendValue(MINUTE_OF_HOUR, 1, 2, SignStyle.NEVER)
+ .optionalStart()
+ .appendLiteral(':')
+ .appendValue(SECOND_OF_MINUTE, 2)
+ .optionalEnd()
+ .optionalStart()
+ .appendLiteral('.')
+ .appendValue(MILLI_OF_SECOND, 3, 5, SignStyle.NEVER)
+ .optionalEnd()
+ .optionalStart()
+ .appendLiteral(' ') //optional space before am/pm
+ .optionalEnd()
+ .appendText(ChronoField.AMPM_OF_DAY)
+ //a single, balanced optional section for the zone suffix; a second unmatched
+ //optionalStart() here was only closed implicitly by toFormatter()
+ .optionalStart()
+ .append(TIME_ZONE_FORMATTER)
+ .optionalEnd()
+ .toFormatter(Locale.US)
+ //.withZone(ZoneId.of("GMT")) see TIKA-3735
+ .withResolverStyle(ResolverStyle.LENIENT)
+ .withResolverFields(DAY_OF_MONTH, MONTH_OF_YEAR, YEAR, HOUR_OF_AMPM, AMPM_OF_DAY,
+ MINUTE_OF_HOUR,
+ SECOND_OF_MINUTE, MILLI_OF_SECOND, OFFSET_SECONDS);
+
+
+ public static final DateTimeFormatter MMM_D_YYYY_HH_MM_AM_PM = // "July 9 2012 10:10:10 am UTC"
+ new DateTimeFormatterBuilder()
+ .parseCaseInsensitive()
+ .parseLenient()
+ .appendPattern("MMM")
+ .appendLiteral(' ')
+ .appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NEVER)
+ .appendLiteral(' ')
+ .appendValueReduced(YEAR, 2, 4, INITIAL_YEAR)
+ .appendLiteral(' ')
+ .appendValue(ChronoField.HOUR_OF_AMPM, 1, 2, SignStyle.NEVER)
+ .appendLiteral(':')
+ .appendValue(MINUTE_OF_HOUR, 1, 2, SignStyle.NEVER)
+ .optionalStart()
+ .appendLiteral(':')
+ .appendValue(SECOND_OF_MINUTE, 2)
+ .optionalEnd()
+ .optionalStart()
+ .appendLiteral('.')
+ .appendValue(MILLI_OF_SECOND, 3, 5, SignStyle.NEVER)
+ .optionalEnd()
+ .optionalStart()
+ .appendLiteral(' ') //optional space before am/pm
+ .optionalEnd()
+ .appendText(ChronoField.AMPM_OF_DAY)
+ .optionalStart()
+ .append(TIME_ZONE_FORMATTER)
+ .optionalEnd()
+ .toFormatter(Locale.US)
+ //.withZone(ZoneId.of("GMT")) see TIKA-3735
+ .withResolverStyle(ResolverStyle.LENIENT)
+ .withResolverFields(DAY_OF_MONTH, MONTH_OF_YEAR, YEAR, HOUR_OF_AMPM, AMPM_OF_DAY,
+ MINUTE_OF_HOUR,
+ SECOND_OF_MINUTE, MILLI_OF_SECOND, OFFSET_SECONDS);
+
+ /**
+ * Lenient "MMM d yy[yy] H:mm[:ss][.SSS][ zone]" formatter using a 24-hour
+ * HOUR_OF_DAY; seconds, fractional seconds and the time zone suffix are optional.
+ * Two-digit years are resolved relative to INITIAL_YEAR.
+ */
+ public static final DateTimeFormatter MMM_D_YYYY_HH_MM = // "July 9 2012 10:10:10 UTC"
+ new DateTimeFormatterBuilder()
+ .parseCaseInsensitive()
+ .parseLenient()
+ .appendPattern("MMM")
+ .appendLiteral(' ')
+ .appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NEVER)
+ .appendLiteral(' ')
+ .appendValueReduced(YEAR, 2, 4, INITIAL_YEAR)
+ .appendLiteral(' ')
+ .appendValue(HOUR_OF_DAY, 1, 2, SignStyle.NEVER)
+ .appendLiteral(':')
+ .appendValue(MINUTE_OF_HOUR, 1, 2, SignStyle.NEVER)
+ .optionalStart()
+ .appendLiteral(':')
+ .appendValue(SECOND_OF_MINUTE, 2)
+ .optionalEnd()
+ .optionalStart()
+ .appendLiteral('.')
+ .appendValue(MILLI_OF_SECOND, 3, 5, SignStyle.NEVER)
+ .optionalEnd()
+ //a single, balanced optional section for the zone suffix; a second unmatched
+ //optionalStart() here was only closed implicitly by toFormatter()
+ .optionalStart()
+ .append(TIME_ZONE_FORMATTER)
+ .optionalEnd()
+ .toFormatter(Locale.US)
+ //.withZone(ZoneId.of("GMT")) see TIKA-3735
+ .withResolverStyle(ResolverStyle.LENIENT)
+ .withResolverFields(DAY_OF_MONTH, MONTH_OF_YEAR, YEAR, HOUR_OF_DAY,
+ MINUTE_OF_HOUR,
+ SECOND_OF_MINUTE, MILLI_OF_SECOND, OFFSET_SECONDS);
+
+ public static final DateTimeFormatter MM_SLASH_DD_SLASH_YY_HH_MM = //
+ // US-based month/day ordering !!!! e.g. 7/9/2012 10:10:10"
+ new DateTimeFormatterBuilder()
+ .parseCaseInsensitive()
+ .parseLenient()
+ .appendValue(MONTH_OF_YEAR, 1, 2, SignStyle.NEVER)
+ .appendLiteral('/')
+ .appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NEVER)
+ .appendLiteral('/')
+ .appendValueReduced(YEAR, 2, 4, INITIAL_YEAR)
+ .appendLiteral(' ')
+ .appendValue(HOUR_OF_DAY, 1, 2, SignStyle.NEVER)
+ .optionalStart()
+ .appendLiteral(':')
+ .appendValue(MINUTE_OF_HOUR, 1, 2, SignStyle.NEVER)
+ .optionalEnd()
+ .optionalStart()
+ .appendLiteral(':')
+ .appendValue(SECOND_OF_MINUTE, 2)
+ .optionalEnd()
+ .optionalStart()
+ .appendLiteral('.')
+ .appendValue(MILLI_OF_SECOND, 3, 5, SignStyle.NEVER)
+ .optionalEnd()
+ .optionalStart()
+ .append(TIME_ZONE_FORMATTER)
+ .optionalEnd()
+ .toFormatter(Locale.US)
+ //.withZone(ZoneId.of("GMT")) see TIKA-3735
+ .withResolverStyle(ResolverStyle.LENIENT)
+ .withResolverFields(DAY_OF_MONTH, MONTH_OF_YEAR, YEAR, HOUR_OF_DAY,
+ MINUTE_OF_HOUR,
+ SECOND_OF_MINUTE, MILLI_OF_SECOND, OFFSET_SECONDS);
+ public static final DateTimeFormatter MM_SLASH_DD_SLASH_YY_HH_MM_AM_PM =
+ // US-based month/day ordering !!!! e.g. 7/9/2012 10:10:10 AM UTC"
+ new DateTimeFormatterBuilder()
+ .parseCaseInsensitive()
+ .parseLenient()
+ .appendValue(MONTH_OF_YEAR, 1, 2, SignStyle.NEVER)
+ .appendLiteral('/')
+ .appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NEVER)
+ .appendLiteral('/')
+ .appendValueReduced(YEAR, 2, 4, INITIAL_YEAR)
+ .appendLiteral(' ')
+ .appendValue(HOUR_OF_AMPM, 1, 2, SignStyle.NEVER)
+ .optionalStart()
+ .appendLiteral(':')
+ .appendValue(MINUTE_OF_HOUR, 1, 2, SignStyle.NEVER)
+ .optionalEnd()
+ .optionalStart()
+ .appendLiteral(':')
+ .appendValue(SECOND_OF_MINUTE, 2)
+ .optionalEnd()
+ .optionalStart()
+ .appendLiteral('.')
+ .appendValue(MILLI_OF_SECOND, 3, 5, SignStyle.NEVER)
+ .optionalEnd()
+ .optionalStart()
+ .appendLiteral(' ')
+ .optionalEnd()
+ .appendText(AMPM_OF_DAY)
+ .optionalStart()
+ .append(TIME_ZONE_FORMATTER)
+ .optionalEnd()
+ .toFormatter(Locale.US)
+ //.withZone(ZoneId.of("GMT")) see TIKA-3735
+ .withResolverStyle(ResolverStyle.LENIENT)
+ .withResolverFields(DAY_OF_MONTH, MONTH_OF_YEAR, YEAR, HOUR_OF_AMPM,
+ AMPM_OF_DAY,
+ MINUTE_OF_HOUR,
+ SECOND_OF_MINUTE, MILLI_OF_SECOND, OFFSET_SECONDS);
+
+ public static final DateTimeFormatter YYYY_MM_DD_HH_MM = // "2012-10-10 10:10:10 UTC"
+ new DateTimeFormatterBuilder()
+ .parseCaseInsensitive()
+ .parseLenient()
+ .appendValue(YEAR, 4)
+ .appendLiteral('-')
+ .appendValue(MONTH_OF_YEAR, 2, 2, SignStyle.NEVER)
+ .appendLiteral('-')
+ .appendValue(DAY_OF_MONTH, 2, 2, SignStyle.NEVER)
+ .appendLiteral(' ')
+ .appendValue(HOUR_OF_DAY, 1, 2, SignStyle.NEVER)
+ .appendLiteral(':')
+ .appendValue(MINUTE_OF_HOUR, 1, 2, SignStyle.NEVER)
+ .optionalStart()
+ .appendLiteral(':')
+ .appendValue(SECOND_OF_MINUTE, 2)
+ .optionalEnd()
+ .optionalStart()
+ .appendLiteral('.')
+ .appendValue(MILLI_OF_SECOND, 3, 5, SignStyle.NEVER)
+ .optionalEnd()
+ .optionalStart()
+ .append(TIME_ZONE_FORMATTER)
+ .optionalEnd()
+ .toFormatter(Locale.US)
+ //.withZone(ZoneId.of("GMT")) see TIKA-3735
+ .withResolverStyle(ResolverStyle.LENIENT)
+ .withResolverFields(DAY_OF_MONTH, MONTH_OF_YEAR, YEAR, HOUR_OF_DAY,
+ MINUTE_OF_HOUR,
+ SECOND_OF_MINUTE, MILLI_OF_SECOND, OFFSET_SECONDS);
+
+ public static final DateTimeFormatter YYYY_MM_DD = // "2012-10-10"
+ new DateTimeFormatterBuilder()
+ .parseCaseInsensitive()
+ .parseLenient()
+ .appendValue(YEAR, 4)
+ .appendLiteral('-')
+ .appendValue(MONTH_OF_YEAR, 2, 2, SignStyle.NEVER)
+ .appendLiteral('-')
+ .appendValue(DAY_OF_MONTH, 2, 2, SignStyle.NEVER)
+ .toFormatter(Locale.US)
+ //.withZone(ZoneId.of("GMT")) see TIKA-3735
+ .withResolverStyle(ResolverStyle.LENIENT)
+ .withResolverFields(DAY_OF_MONTH, MONTH_OF_YEAR, YEAR);
+
+ public static final DateTimeFormatter MM_SLASH_DD_SLASH_YYYY =
+ new DateTimeFormatterBuilder()
+ .appendPattern("M/d/")
+ .appendValueReduced(ChronoField.YEAR, 2, 4, INITIAL_YEAR)
+ .toFormatter(Locale.US).withZone(MIDDAY.toZoneId());
+
+ public static final DateTimeFormatter DD_SLASH_MM_SLASH_YYYY =
+ new DateTimeFormatterBuilder()
+ .appendPattern("d/M/")
+ .appendValueReduced(ChronoField.YEAR, 2, 4, INITIAL_YEAR)
+ .toFormatter(Locale.US).withZone(MIDDAY.toZoneId());
+ public static final DateTimeFormatter MMM_DD_YY =
+ new DateTimeFormatterBuilder()
+ .parseCaseInsensitive()
+ .parseLenient()
+ .appendPattern("MMM")
+ .appendLiteral(' ')
+ .appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NEVER)
+ .appendLiteral(' ')
+ .appendValueReduced(YEAR, 2, 4, INITIAL_YEAR)
+ .toFormatter(Locale.US);
+
+ public static final DateTimeFormatter DD_MMM_YY =
+ new DateTimeFormatterBuilder()
+ .parseCaseInsensitive()
+ .parseLenient()
+ .appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NEVER)
+ .appendLiteral(' ')
+ .appendPattern("MMM")
+ .appendLiteral(' ')
+ .appendValueReduced(YEAR, 2, 4, INITIAL_YEAR)
+ .toFormatter(Locale.US);
+
+ public static final DateTimeFormatter YY_SLASH_MM_SLASH_DD =
+ new DateTimeFormatterBuilder()
+ .appendValueReduced(ChronoField.YEAR, 2, 4, INITIAL_YEAR)
+ .appendPattern("/M/d")
+ .toFormatter(Locale.US).withZone(MIDDAY.toZoneId());
+
+
+ private static final DateTimeFormatter[] DATE_FORMATTERS = new DateTimeFormatter[] {
+ DD_MMM_YY,
+ MMM_DD_YY,
+ YYYY_MM_DD,
+ MM_SLASH_DD_SLASH_YYYY,//try American first?
+ DD_SLASH_MM_SLASH_YYYY,//if that fails, try rest of world?
+ YY_SLASH_MM_SLASH_DD
+ };
+
+
+
+ private static final DateTimeFormatter[] DATE_TIME_FORMATTERS = new DateTimeFormatter[] {
+ RFC_5322_LENIENT,
+ RFC_5322_AMPM_LENIENT,
+ MMM_D_YYYY_HH_MM,
+ MMM_D_YYYY_HH_MM_AM_PM,
+ YYYY_MM_DD_HH_MM,
+ MM_SLASH_DD_SLASH_YY_HH_MM,
+ MM_SLASH_DD_SLASH_YY_HH_MM_AM_PM
+
+ };
+ /**
+ * Parses a date header with the RFC_5322 formatter after trimming it and
+ * upper-casing it with Locale.US. Parsing starts at index 0 via a ParsePosition.
+ * A null argument skips the clean-up and is handed to the formatter unchanged.
+ * The result is built with Instant.from, so the parsed text presumably has to
+ * carry an offset even though the formatter marks it optional -- confirm.
+ *
+ * NOTE(review): no statement visible here throws the declared ParseException;
+ * java.time failures would presumably surface as unchecked exceptions -- confirm
+ * that callers handle them.
+ *
+ * @param string raw header value
+ * @return the parsed instant as a java.util.Date
+ * @throws ParseException declared for callers; see the review note above
+ */
+ public static Date parseRFC5322(String string) throws ParseException {
+ //this fails on: MON, 9 MAY 2016 3:32:00 GMT+0200 ... it stops short and doesn't include
+ // the +0200?!
+ if (string != null) {
+ string = string.trim();
+ string = string.toUpperCase(Locale.US);
+ }
+ return Date.from(Instant.from(RFC_5322.parse(string, new ParsePosition(0))));
+ }
+
+ /**
+ * Best-effort date parsing that returns null instead of throwing on bad input.
+ * The text is normalized first, then each of DATE_TIME_FORMATTERS is tried in
+ * array order. If a formatter cannot produce a ZonedDateTime, the same formatter
+ * is retried as a LocalDateTime that is interpreted as UTC. If no date-time
+ * formatter succeeds, each of DATE_FORMATTERS is tried and the resulting date is
+ * placed at start of day in the MIDDAY zone. SecurityException is always
+ * rethrown; every other exception moves on to the next formatter.
+ *
+ * @param text raw date string, may be null
+ * @return the parsed date, or null if text is null or no formatter matched
+ */
+ public static Date parseDateLenient(String text) {
+ if (text == null) {
+ return null;
+ }
+ String normalized = normalize(text);
+ for (DateTimeFormatter dateTimeFormatter : DATE_TIME_FORMATTERS) {
+ try {
+ ZonedDateTime zonedDateTime = ZonedDateTime.parse(normalized, dateTimeFormatter);
+ return Date.from(Instant.from(zonedDateTime));
+ } catch (SecurityException e) {
+ throw e;
+ } catch (DateTimeParseException e) {
+
+ //There's a bug in java 8 that if we include .withZone in the DateTimeFormatter,
+ //that will override the offset/timezone id even if it included
+ // in the original string. This is fixed in later versions of Java.
+ // Once we move to Java 11, we can get rid of this. Can't make this up...
+ //Fallback: no zone could be resolved, so treat the local date-time as UTC.
+ try {
+ LocalDateTime localDateTime = LocalDateTime.parse(normalized, dateTimeFormatter);
+ return Date.from(Instant.from(localDateTime.atOffset(UTC)));
+ } catch (SecurityException e2) {
+ throw e2;
+ } catch (Exception e2) {
+ //swallow
+ }
+ } catch (Exception e) {
+ //can get StringIndexOutOfBoundsException because of a bug in java 8
+ //ignore
+ }
+ }
+
+
+ //no date-time formatter matched: try the date-only formats
+ for (DateTimeFormatter dateFormatter : DATE_FORMATTERS) {
+ try {
+ TemporalAccessor temporalAccessor = dateFormatter.parse(normalized);
+ ZonedDateTime localDate = LocalDate.from(temporalAccessor)
+ .atStartOfDay()
+ .atZone(MIDDAY.toZoneId());
+ return Date.from(Instant.from(localDate));
+ } catch (SecurityException e) {
+ throw e;
+ } catch (Exception e) {
+ //ignore
+ }
+ }
+ return null;
+ }
+
+ /**
+ * Reports whether INSTANT_SECONDS can be read from the given accessor.
+ * NOTE(review): nothing in the visible part of this class calls this helper.
+ *
+ * @param temporalAccessor parsed temporal to probe
+ * @return true if getLong(INSTANT_SECONDS) succeeds, false if it throws DateTimeException
+ */
+ private static boolean hasInstantSeconds(TemporalAccessor temporalAccessor) {
+ try {
+ temporalAccessor.getLong(INSTANT_SECONDS);
+ return true;
+ } catch (DateTimeException e) {
+ return false;
+ }
+ }
+
+ /**
+ * Canonicalizes a raw date string before it is handed to the formatters.
+ * Steps, in order: upper-case with Locale.US; remove commas; cut everything after
+ * the first numeric offset; rewrite a trailing UTC/GMT offset into a bare +HH:MM
+ * form; insert a space before a glued am/pm marker; drop day-of-week names;
+ * drop the word " at "; collapse whitespace and trim.
+ *
+ * @param text non-null raw date string
+ * @return the normalized string
+ */
+ private static String normalize(String text) {
+
+ text = text.toUpperCase(Locale.US);
+
+ //strip out commas
+ text = text.replaceAll(",", "");
+
+ //strip off extra stuff after +0800, e.g. "Mon, 9 May 2016 7:32:00 UTC+0600 (BST)",
+ Matcher matcher = OFFSET_PATTERN.matcher(text);
+ if (matcher.find()) {
+ text = text.substring(0, matcher.end());
+ }
+
+ //must run after the truncation above: the pattern is anchored to the end of the text
+ matcher = LOCALIZED_OFFSET_PATTERN.matcher(text);
+ if (matcher.find()) {
+ text = buildLocalizedOffset(matcher, text);
+ }
+
+ matcher = AM_PM.matcher(text);
+ if (matcher.find()) {
+ text = matcher.replaceFirst("$1 $2");
+ }
+ //The rfc_lenient parser had a problem parsing dates
+ //with days of week missing and a timezone: 9 May 2016 01:32:00 UTC
+ //The day of week is not used in the resolvers, so we may as well throw
+ //out that info
+ matcher = DAYS_OF_WEEK.matcher(text);
+ if (matcher.find()) {
+ text = matcher.replaceAll(" ");
+ }
+ //16 May 2016 at 09:30:32 GMT+1
+ text = text.replaceAll("(?i) at ", " ");
+ //just cause
+ text = text.replaceAll("\\s+", " ").trim();
+ return text;
+ }
+
+ /**
+ * Replaces the span matched by LOCALIZED_OFFSET_PATTERN with a bare offset of the
+ * form sign + two-digit hour + ":" + minutes, dropping the UTC/GMT prefix.
+ * The hour is left-padded with '0' to two characters and the minutes default to
+ * "00" when the match has none. Text before and after the match is kept.
+ *
+ * @param matcher matcher positioned on a successful find() over text
+ * @param text the string the matcher was run against
+ * @return text with the matched offset rewritten
+ */
+ private static String buildLocalizedOffset(Matcher matcher, String text) {
+ StringBuilder sb = new StringBuilder();
+ sb.append(text.substring(0, matcher.start()));
+ sb.append(matcher.group(1));// +/-
+ sb.append(StringUtils.leftPad(matcher.group(2), 2, '0'));//HH
+ sb.append(":");
+ if (matcher.group(3) != null) {
+ sb.append(matcher.group(3));
+ } else {
+ sb.append("00");
+ }
+ sb.append(text.substring(matcher.end()));
+ return sb.toString();
+ }
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-commons/src/test/java/org/apache/tika/parser/mailcommons/MailDateParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-commons/src/test/java/org/apache/tika/parser/mailcommons/MailDateParserTest.java
new file mode 100644
index 000000000..37a7a29d3
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-commons/src/test/java/org/apache/tika/parser/mailcommons/MailDateParserTest.java
@@ -0,0 +1,186 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mailcommons;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.text.DateFormat;
+import java.text.DateFormatSymbols;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.Locale;
+import java.util.TimeZone;
+
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+
+public class MailDateParserTest {
+
+ @Test
+ public void testDateTimesWithTimeZones() throws Exception {
+ String expected = "2016-05-09T01:32:00Z";
+
+ //try with timezones
+ for (String dateString : new String[] {
+ // with timezone info:
+ "Mon, 9 May 16 01:32:00 GMT",
+ "9 May 16 01:32:00 GMT",
+ "Monday, 9 May 16 01:32:00 GMT",
+ "Mon, 9 May 2016 01:32:00 UTC",
+ "9 May 2016 01:32:00 UTC",
+ "09 May 2016 01:32:00 UTC",
+ "Mon, 9 May 2016 01:32:00Z",
+ "Mon, 9 May 2016 01:32:00 Z",
+ "Mon, 9 May 2016 01:32:00 GMT",
+ "Mon, 9 May 2016 01:32:00GMT",
+ "Mon, 9 May 2016 01:32:00 UTC",
+ "Mon, 9 May 2016 01:32:00UTC",
+
+ "Mon, 9 May 2016 3:32:00 GMT+0200",
+ "Mon, 9 May 2016 3:32:00 UTC+0200",
+ "Mon, 9 May 2016 7:32:00 UTC+0600 (BST)",
+
+ //try with leading space
+ " Mon, 9 May 2016 3:32:00 +0200",
+ " 9 May 2016 3:32:00 +0200",
+ "Mon, 9 May 2016 3:32:00 +02:00",
+ "9 May 2016 3:32:00 +02:00",
+ "Mon, 9 May 2016 3:32:00+02:00",
+ "Mon, 9 May 2016 3:32:00+0200",
+ " Sun, 8 May 2016 21:32:00 EST",
+ //need to add am/pm format times? I hope not.
+
+ }) {
+ testDate(dateString, expected, true);
+ }
+ }
+
+ @Test
+ @Disabled("for dev purposes")
+ public void oneOff() throws Exception {
+ /* SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss z");
+ System.out.println(simpleDateFormat.format(new Date()));
+ DateTimeFormatter formatter = DateTimeFormatter
+ .ofPattern("yyyy-MM-dd'T'HH:mm:ss.S OOOO")
+ .withLocale(Locale.US);
+ String date = formatter.format(ZonedDateTime.now(ZoneOffset.UTC));
+ System.out.println("String: " + date);
+ System.out.println("parsed: " + formatter.parse(date) + " from " + date);
+*/
+ String s = "Mon, 6 Sep 2010 05:25:34 -0400 (EDT)";
+ s = "Tue, 9 Jun 2009 23:58:45 -0400";
+
+ //System.out.println(RFC)
+ try {
+ //turn this back on when we upgrade
+ //System.out.println("mime4j: " + DateTimeFieldLenientImpl.RFC_5322.parse(s));
+ } catch (Exception e) {
+ System.out.println("mime4j: null");
+ }
+ try {
+ Date d = MailDateParser.parseDateLenient(s);
+ DateFormat df =
+ new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", new DateFormatSymbols(Locale.US));
+ df.setTimeZone(TimeZone.getTimeZone("UTC"));
+ String dateString = df.format(d);
+ System.out.println("dev parser lenient: " + dateString);
+ } catch (Exception e) {
+ System.out.println("dev parser lenient: null");
+ }
+ }
+
+ @Test
+ public void testDateTimesWithNoTimeZone() throws Exception {
+ String expected = "2016-05-09T01:32:00Z";
+
+ for (String dateString : new String[]{
+ "Mon, 9 May 2016 01:32:00",
+ "Monday, 9 May 2016 1:32 AM", "May 9 2016 1:32am", "May 9 2016 1:32 am",
+ "2016-05-09 01:32:00"}) {
+ testDate(dateString, expected, true);
+ }
+ }
+
+ @Test
+ public void testDates() throws Exception {
+ //now try days without times
+ String expected = "2016-05-15T12:00:00Z";
+ for (String dateString : new String[]{
+ "May 15, 2016", "Sun, 15 May 2016", "15 May 2016",
+ "2016-05-15"
+ }) {
+ testDate(dateString, expected, true);
+
+ }
+ }
+
+ /**
+ * Guards against two-digit years being resolved to the wrong century: every
+ * sample must parse and must fall strictly between 1980-01-01 and 2010-01-01.
+ */
+ @Test
+ public void testTrickyDates() throws Exception {
+ DateFormat df = new SimpleDateFormat("yyyy-MM-dd", new DateFormatSymbols(Locale.US));
+ //make sure there are no mis-parses of e.g. 90 = year 90 A.D, not 1990
+ Date date1980 = df.parse("1980-01-01");
+ Date date2010 = df.parse("2010-01-01");
+ for (String dateString : new String[]{
+ "11/14/08",
+ "1/14/08",
+ "1/2/08",
+ "12/1/2008",
+ "12/02/1996",
+ "96/1/02",
+ "96/12/02",
+ "96/12/2",
+ "1996/12/02",
+ "Mon, 29 Jan 96 14:02 GMT",
+ "7/20/95 1:12PM",
+ "08/14/2000 12:48 AM",
+ "8/4/2000 1:48 AM",
+ "06/24/2008, Tuesday, 11 AM",
+ }) {
+ Date parsedDate = MailDateParser.parseDateLenient(dateString);
+ //assertNotNull fails the test on null, so no further null guard is needed
+ assertNotNull(parsedDate, "couldn't parse " + dateString);
+ assertTrue(parsedDate.getTime() > date1980.getTime(),
+ "date must be after 1980:" + dateString + " >> + " +
+ parsedDate);
+ //the upper bound is date2010, so the message names 2010
+ assertTrue(parsedDate.getTime() < date2010.getTime(),
+ "date must be before 2010: " + dateString + " >> + " +
+ parsedDate);
+ }
+ //TODO: mime4j misparses these to pre 1980 dates
+ //"Wed, 27 Dec 95 11:20:40 EST",
+ //"26 Aug 00 11:14:52 EDT"
+ //
+ //We are still misparsing: 8/1/03 to a pre 1980 date
+
+ }
+
+ /**
+ * Parses dateString with MailDateParser.parseDateLenient and asserts that the
+ * result, formatted as yyyy-MM-dd'T'HH:mm:ss'Z', equals expected.
+ *
+ * @param dateString raw input to parse
+ * @param expected expected formatted value
+ * @param useUTC if true the result is formatted in UTC; otherwise the
+ * SimpleDateFormat's own time zone is left unchanged
+ */
+ private void testDate(String dateString, String expected, boolean useUTC) throws Exception {
+ Date parsedDate = MailDateParser.parseDateLenient(dateString);
+ assertNotNull(parsedDate, "couldn't parse " + dateString);
+ DateFormat df =
+ new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", new DateFormatSymbols(Locale.US));
+ if (useUTC) {
+ df.setTimeZone(TimeZone.getTimeZone("UTC"));
+ }
+ String parsedDateString = df.format(parsedDate);
+ assertEquals(expected, parsedDateString, "failed to match: " + dateString);
+ }
+
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
index 2a5e76d7d..1ea75cda4 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
@@ -16,15 +16,8 @@
*/
package org.apache.tika.parser.mail;
-import static org.apache.tika.utils.DateUtils.MIDDAY;
-import static org.apache.tika.utils.DateUtils.UTC;
-
import java.io.IOException;
import java.io.InputStream;
-import java.text.DateFormat;
-import java.text.DateFormatSymbols;
-import java.text.ParseException;
-import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
@@ -32,9 +25,6 @@ import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Stack;
-import java.util.TimeZone;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
@@ -47,7 +37,6 @@ import org.apache.james.mime4j.dom.address.AddressList;
import org.apache.james.mime4j.dom.address.Mailbox;
import org.apache.james.mime4j.dom.address.MailboxList;
import org.apache.james.mime4j.dom.field.AddressListField;
-import org.apache.james.mime4j.dom.field.DateTimeField;
import org.apache.james.mime4j.dom.field.MailboxListField;
import org.apache.james.mime4j.dom.field.ParsedField;
import org.apache.james.mime4j.dom.field.UnstructuredField;
@@ -71,6 +60,7 @@ import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.csv.TextAndCSVParser;
import org.apache.tika.parser.html.HtmlParser;
+import org.apache.tika.parser.mailcommons.MailDateParser;
import org.apache.tika.parser.mailcommons.MailUtil;
import org.apache.tika.parser.txt.TXTParser;
import org.apache.tika.sax.BodyContentHandler;
@@ -86,46 +76,6 @@ class MailContentHandler implements ContentHandler {
private static final String MULTIPART_ALTERNATIVE = "multipart/alternative";
- //TIKA-1970 Mac Mail's format
- private static final Pattern GENERAL_TIME_ZONE_NO_MINUTES_PATTERN =
- Pattern.compile("(?:UTC|GMT)([+-])(\\d?\\d)\\Z");
-
- //find a time ending in am/pm without a space: 10:30am and
- //use this pattern to insert space: 10:30 am
- private static final Pattern AM_PM = Pattern.compile("(?i)(\\d)([ap]m)\\b");
-
- private static final DateFormatInfo[] ALTERNATE_DATE_FORMATS = new DateFormatInfo[] {
- //note that the string is "cleaned" before processing:
- //1) condense multiple whitespace to single space
- //2) trim()
- //3) strip out commas
- //4) insert space before am/pm
- new DateFormatInfo("MMM dd yy hh:mm a"),
-
- //this is a standard pattern handled by mime4j;
- //but mime4j fails with leading whitespace
- new DateFormatInfo("EEE d MMM yy HH:mm:ss Z", UTC),
-
- new DateFormatInfo("EEE d MMM yy HH:mm:ss z", UTC),
-
- new DateFormatInfo("EEE d MMM yy HH:mm:ss", null),// no timezone
-
- new DateFormatInfo("EEEEE MMM d yy hh:mm a", null),// Sunday, May 15 2016 1:32 PM
-
- //16 May 2016 at 09:30:32 GMT+1 (Mac Mail TIKA-1970)
- new DateFormatInfo("d MMM yy 'at' HH:mm:ss z", UTC), // UTC/Zulu
-
- new DateFormatInfo("yy-MM-dd HH:mm:ss", null),
-
- new DateFormatInfo("MM/dd/yy hh:mm a", null, false),
-
- //now dates without times
- new DateFormatInfo("MMM d yy", MIDDAY, false),
- new DateFormatInfo("EEE d MMM yy", MIDDAY, false),
- new DateFormatInfo("d MMM yy", MIDDAY, false),
- new DateFormatInfo("yy/MM/dd", MIDDAY, false),
- new DateFormatInfo("MM/dd/yy", MIDDAY, false)};
-
private final XHTMLContentHandler handler;
private final Metadata metadata;
private final ParseContext parseContext;
@@ -154,45 +104,6 @@ class MailContentHandler implements ContentHandler {
this.detector = detector;
}
- private static DateFormat createDateFormat(DateFormatInfo dateFormatInfo) {
- SimpleDateFormat sdf = new SimpleDateFormat(dateFormatInfo.pattern,
- new DateFormatSymbols(Locale.US));
- if (dateFormatInfo.timeZone != null) {
- sdf.setTimeZone(dateFormatInfo.timeZone);
- }
- sdf.setLenient(dateFormatInfo.lenient);
- return sdf;
- }
-
- private static Date tryOtherDateFormats(String text) {
- if (text == null) {
- return null;
- }
- text = text.replaceAll("\\s+", " ").trim();
- //strip out commas
- text = text.replaceAll(",", "");
-
- Matcher matcher = GENERAL_TIME_ZONE_NO_MINUTES_PATTERN.matcher(text);
- if (matcher.find()) {
- text = matcher.replaceFirst("GMT$1$2:00");
- }
-
- matcher = AM_PM.matcher(text);
- if (matcher.find()) {
- text = matcher.replaceFirst("$1 $2");
- }
-
- for (DateFormatInfo formatInfo : ALTERNATE_DATE_FORMATS) {
- try {
- DateFormat format = createDateFormat(formatInfo);
- return format.parse(text);
- } catch (ParseException e) {
- //continue
- }
- }
- return null;
- }
-
@Override
public void body(BodyDescriptor body, InputStream is) throws MimeException, IOException {
// use a different metadata object
@@ -431,12 +342,16 @@ class MailContentHandler implements ContentHandler {
field.getBody());
}
} else if (fieldname.equalsIgnoreCase("Date")) {
- DateTimeField dateField = (DateTimeField) parsedField;
- Date date = dateField.getDate();
- if (date == null) {
- date = tryOtherDateFormats(field.getBody());
+ String dateBody = parsedField.getBody();
+ Date date = null;
+ try {
+ date = MailDateParser.parseDateLenient(dateBody);
+ metadata.set(TikaCoreProperties.CREATED, date);
+ } catch (SecurityException e) {
+ throw e;
+ } catch (Exception e) {
+ //swallow: an unparseable Date header must not fail the whole parse; CREATED is simply left unset
}
- metadata.set(TikaCoreProperties.CREATED, date);
} else {
metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX + parsedField.getName(),
field.getBody());
@@ -649,24 +564,4 @@ class MailContentHandler implements ContentHandler {
this.bytes = bytes;
}
}
-
- private static class DateFormatInfo {
- String pattern;
- TimeZone timeZone;
- boolean lenient;
-
- public DateFormatInfo(String pattern) {
- this(pattern, null, true);
- }
-
- public DateFormatInfo(String pattern, TimeZone timeZone) {
- this(pattern, timeZone, true);
- }
-
- public DateFormatInfo(String pattern, TimeZone timeZone, boolean lenient) {
- this.pattern = pattern;
- this.timeZone = timeZone;
- this.lenient = lenient;
- }
- }
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
index a09a3fcfa..d01ae191a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
@@ -16,13 +16,12 @@
*/
package org.apache.tika.parser.mbox;
-import static org.apache.tika.parser.mailcommons.MailDateParser.parseDate;
+import static org.apache.tika.parser.mailcommons.MailDateParser.parseDateLenient;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
-import java.text.ParseException;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
@@ -194,9 +193,13 @@ public class MboxParser extends AbstractParser {
metadata.add(TikaCoreProperties.SUBJECT, headerContent);
} else if (headerTag.equalsIgnoreCase("Date")) {
try {
- Date date = parseDate(headerContent);
- metadata.set(TikaCoreProperties.CREATED, date);
- } catch (ParseException e) {
+ Date date = parseDateLenient(headerContent);
+ if (date != null) {
+ metadata.set(TikaCoreProperties.CREATED, date);
+ }
+ } catch (SecurityException e) {
+ throw e;
+ } catch (Exception e) {
// ignoring date because format was not understood
}
} else if (headerTag.equalsIgnoreCase("Message-Id")) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
index d87fd549b..04b9cd657 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
@@ -32,13 +32,7 @@ import static org.mockito.Mockito.verify;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
-import java.text.DateFormat;
-import java.text.DateFormatSymbols;
-import java.text.SimpleDateFormat;
-import java.util.Date;
import java.util.List;
-import java.util.Locale;
-import java.util.TimeZone;
import org.apache.james.mime4j.stream.MimeConfig;
import org.junit.jupiter.api.BeforeAll;
@@ -378,74 +372,8 @@ public class RFC822ParserTest extends TikaTest {
r = getXML("testRFC822_eml");
assertEquals("2016-05-16T08:30:32Z", r.metadata.get(TikaCoreProperties.CREATED));
-
- String expected = "2016-05-15T01:32:00Z";
-
- int dateNum = 0;
- for (String dateString : new String[] {
- // with timezone info:
- "Sun, 15 May 2016 01:32:00 UTC", " Sun, 15 May 2016 3:32:00 +0200",
- // format correctly handled by mime4j if no leading whitespace
- " Sun, 14 May 2016 20:32:00 EST",
- // no timezone info:
- "Sun, 15 May 2016 01:32:00",
- "Sunday, May 15 2016 1:32 AM", "May 15 2016 1:32am", "May 15 2016 1:32 am",
- "2016-05-15 01:32:00", }) {
- testDate(dateString, expected, dateNum++ < 3);
- }
-
- //now try days without times
- expected = "2016-05-15T12:00:00Z";
- for (String dateString : new String[]{"May 15, 2016", "Sun, 15 May 2016", "15 May 2016",}) {
- testDate(dateString, expected, true);
- }
}
- @Test
- public void testTrickyDates() throws Exception {
- DateFormat df = new SimpleDateFormat("yyyy-MM-dd", new DateFormatSymbols(Locale.US));
- //make sure there are no mis-parses of e.g. 90 = year 90 A.D, not 1990
- Date date1980 = df.parse("1980-01-01");
- for (String dateString : new String[]{"Mon, 29 Jan 96 14:02 GMT", "7/20/95 1:12pm",
- "08/14/2000 12:48 AM", "06/24/2008, Tuesday, 11 AM", "11/14/08", "12/02/1996",
- "96/12/02",}) {
- Date parsedDate = getDate(dateString);
- if (parsedDate != null) {
- assertTrue(parsedDate.getTime() > date1980.getTime(),
- "date must be after 1980:" + dateString);
- }
- }
- //TODO: mime4j misparses these to pre 1980 dates
- //"Wed, 27 Dec 95 11:20:40 EST",
- //"26 Aug 00 11:14:52 EDT"
- //
- //We are still misparsing: 8/1/03 to a pre 1980 date
-
- }
-
- private void testDate(String dateString, String expected, boolean useUTC) throws Exception {
- Date parsedDate = getDate(dateString);
- assertNotNull(parsedDate, "couldn't parse " + dateString);
- DateFormat df =
- new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", new DateFormatSymbols(Locale.US));
- if (useUTC) {
- df.setTimeZone(TimeZone.getTimeZone("UTC"));
- }
- String parsedDateString = df.format(parsedDate);
- assertEquals(expected, parsedDateString, "failed to match: " + dateString);
- }
-
- private Date getDate(String dateString) throws Exception {
- String mail = "From: dev@tika.apache.org\n" + "Date: " + dateString + "\n";
- Parser p = new RFC822Parser();
- Metadata m = new Metadata();
- try (InputStream is = TikaInputStream.get(mail.getBytes(StandardCharsets.UTF_8))) {
- p.parse(is, new DefaultHandler(), m, new ParseContext());
- }
- return m.getDate(TikaCoreProperties.CREATED);
- }
-
-
@Test
public void testMultipleSubjects() throws Exception {
//adapted from govdocs1 303710.txt
@@ -615,6 +543,9 @@ public class RFC822ParserTest extends TikaTest {
List<Metadata> metadataList = getRecursiveMetadata("testRFC822-ARC");
assertEquals(1, metadataList.size());
assertEquals("message/rfc822", metadataList.get(0).get(Metadata.CONTENT_TYPE));
+
+ //Also, test that this date header has been parsed: Wed, 26 Jan 2022 09:14:37 +0100 (CET); only the "2022-01-" prefix is asserted
+ assertTrue(metadataList.get(0).get(TikaCoreProperties.CREATED).startsWith("2022-01-"));
}
@Test
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 4e9ce90e2..5e8b0e77a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -23,7 +23,6 @@ import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
-import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
@@ -257,10 +256,12 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
// See if we can parse it as a normal mail date
try {
- Date d = MailDateParser.parseDate(date);
+ Date d = MailDateParser.parseDateLenient(date);
metadata.set(TikaCoreProperties.CREATED, d);
metadata.set(TikaCoreProperties.MODIFIED, d);
- } catch (ParseException e) {
+ } catch (SecurityException e ) {
+ throw e;
+ } catch (Exception e) {
// Store it as-is, and hope for the best...
metadata.set(TikaCoreProperties.CREATED, date);
metadata.set(TikaCoreProperties.MODIFIED, date);