You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/01/18 22:49:09 UTC
[tika] branch TIKA-3957 updated: TIKA-3957 -- this is in better shape...
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-3957
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-3957 by this push:
new 65346f5db TIKA-3957 -- this is in better shape...
65346f5db is described below
commit 65346f5dbfc6d923a4776204e44ac70a35ac3df6
Author: tallison <ta...@apache.org>
AuthorDate: Wed Jan 18 17:48:54 2023 -0500
TIKA-3957 -- this is in better shape...
---
.../tika/parser/mailcommons/MailDateParser.java | 455 ++++++++++++++++-----
.../parser/mailcommons/MailDateParserTest.java | 93 +++--
.../tika/parser/mail/MailContentHandler.java | 76 +---
.../org/apache/tika/parser/mbox/MboxParser.java | 13 +-
.../apache/tika/parser/mail/RFC822ParserTest.java | 9 +-
.../tika/parser/microsoft/OutlookExtractor.java | 7 +-
6 files changed, 438 insertions(+), 215 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-commons/src/main/java/org/apache/tika/parser/mailcommons/MailDateParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-commons/src/main/java/org/apache/tika/parser/mailcommons/MailDateParser.java
index a3a031b2e..3a5220710 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-commons/src/main/java/org/apache/tika/parser/mailcommons/MailDateParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-commons/src/main/java/org/apache/tika/parser/mailcommons/MailDateParser.java
@@ -17,8 +17,10 @@
package org.apache.tika.parser.mailcommons;
import static java.time.ZoneOffset.UTC;
+import static java.time.temporal.ChronoField.AMPM_OF_DAY;
import static java.time.temporal.ChronoField.DAY_OF_MONTH;
import static java.time.temporal.ChronoField.DAY_OF_WEEK;
+import static java.time.temporal.ChronoField.HOUR_OF_AMPM;
import static java.time.temporal.ChronoField.HOUR_OF_DAY;
import static java.time.temporal.ChronoField.INSTANT_SECONDS;
import static java.time.temporal.ChronoField.MILLI_OF_SECOND;
@@ -29,24 +31,20 @@ import static java.time.temporal.ChronoField.SECOND_OF_MINUTE;
import static java.time.temporal.ChronoField.YEAR;
import static org.apache.tika.utils.DateUtils.MIDDAY;
-import java.text.DateFormatSymbols;
import java.text.ParseException;
import java.text.ParsePosition;
import java.time.DateTimeException;
import java.time.Instant;
import java.time.LocalDate;
-import java.time.ZoneId;
+import java.time.LocalDateTime;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeFormatterBuilder;
+import java.time.format.DateTimeParseException;
import java.time.format.ResolverStyle;
import java.time.format.SignStyle;
import java.time.temporal.ChronoField;
import java.time.temporal.TemporalAccessor;
-import java.time.temporal.TemporalField;
-import java.time.temporal.TemporalQueries;
-import java.time.temporal.TemporalQuery;
-import java.time.temporal.TemporalUnit;
import java.util.Date;
import java.util.HashMap;
import java.util.Locale;
@@ -54,13 +52,30 @@ import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import org.apache.tika.utils.DateUtils;
+import org.apache.tika.utils.StringUtils;
+/**
+ * Dates in emails are a mess. There are at least two major date related bugs in JDK 8.
+ * This class does its best to parse date strings. It does have a US-based date bias.
+ * Please open a ticket to fix this. We can also add overrides via the parser config
+ * to manage custom dates.
+ */
public class MailDateParser {
- //TIKA-1970 Mac Mail's format
- private static final Pattern GENERAL_TIME_ZONE_NO_MINUTES_PATTERN =
- Pattern.compile("(?:UTC|GMT)([+-])(\\d?\\d)\\Z");
+ //TIKA-1970 Mac Mail's format is GMT+1 so we need to check for hour only
+ //Also, there are numerous bugs in jdk 8 with localized offsets
+ //so we need to get rid of the GMT/UTC component (e.g. https://bugs.openjdk.org/browse/JDK-8154520)
+ private static final Pattern LOCALIZED_OFFSET_PATTERN =
+ Pattern.compile("(?:UTC|GMT)\\s*([-+])\\s*(\\d?\\d):?(\\d\\d)?\\Z");
+
+ //this is used to strip junk after a fairly full offset:
+ // Wed, 26 Jan 2022 09:14:37 +0100 (CET)
+ private static final Pattern OFFSET_PATTERN =
+ Pattern.compile("[-+]\\s*\\d?\\d:?\\d\\d");
+
+ private static final Pattern DAYS_OF_WEEK =
+ Pattern.compile("(?:\\A| )(MON|MONDAY|TUE|TUES|TUESDAY|WED|WEDNESDAY|THU|THUR|THURS" +
+ "|THURSDAY|FRI|FRIDAY|SAT|SATURDAY|SUN|SUNDAY) ");
//find a time ending in am/pm without a space: 10:30am and
//use this pattern to insert space: 10:30 am
@@ -97,6 +112,25 @@ public class MailDateParser {
}
private static final int INITIAL_YEAR = 1970;
+
+ private static final DateTimeFormatter TIME_ZONE_FORMATTER
+ = new DateTimeFormatterBuilder()
+ .parseCaseInsensitive()
+ .parseLenient()
+ .optionalStart()
+ .appendLiteral(' ') //optional space before any of the time zone offset/ids
+ .optionalEnd()
+ .optionalStart()
+ .appendZoneId()
+ .optionalEnd()
+ .optionalStart()
+ .appendPattern("X")//zone offset ('Z' for zero), e.g. Z; -08; -0830; -08:30; -083015; -08:30:15
+ .optionalEnd()
+ .optionalStart()
+ .appendPattern("z")//zone name, e.g. PST
+ .optionalEnd().toFormatter(Locale.US);
+
+
public static final DateTimeFormatter RFC_5322 = new DateTimeFormatterBuilder()
.parseCaseInsensitive()
.parseLenient()
@@ -134,14 +168,6 @@ public class MailDateParser {
public static final DateTimeFormatter RFC_5322_LENIENT = new DateTimeFormatterBuilder()
.parseCaseInsensitive()
.parseLenient()
- .optionalStart()
- .appendPattern("EEEEE")
- .appendLiteral(' ')
- .optionalEnd()
- .optionalStart()
- .appendPattern("E")
- .appendLiteral(' ')
- .optionalEnd()
.appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NEVER)
.appendLiteral(' ')
.appendPattern("MMM")
@@ -160,65 +186,281 @@ public class MailDateParser {
.appendValue(MILLI_OF_SECOND, 3, 5, SignStyle.NEVER)
.optionalEnd()
.optionalStart()
- .optionalStart()
- .appendLiteral(' ')
+ .append(TIME_ZONE_FORMATTER)
.optionalEnd()
- .appendOffset("+HHMM", "GMT")
- .optionalEnd()
- .optionalStart()
- .optionalStart()
+ .toFormatter(Locale.US)
+ //.withZone(ZoneId.of("GMT")) see TIKA-3735
+ .withResolverStyle(ResolverStyle.LENIENT)
+ .withResolverFields(DAY_OF_MONTH, MONTH_OF_YEAR, YEAR,
+ HOUR_OF_DAY, MINUTE_OF_HOUR,
+ SECOND_OF_MINUTE, MILLI_OF_SECOND, OFFSET_SECONDS);
+
+
+ //this differs from RFC_5322_LENIENT only in requiring am/pm
+ public static final DateTimeFormatter RFC_5322_AMPM_LENIENT = new DateTimeFormatterBuilder()
+ .parseCaseInsensitive()
+ .parseLenient()
+ .appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NEVER)
.appendLiteral(' ')
- .optionalEnd()
+ .appendPattern("MMM")
+ .appendLiteral(' ')
+ .appendValueReduced(YEAR, 2, 4, INITIAL_YEAR)
+ .appendLiteral(' ')
+ .appendValue(ChronoField.HOUR_OF_AMPM, 1, 2, SignStyle.NEVER)
+ .appendLiteral(':')
+ .appendValue(MINUTE_OF_HOUR, 1, 2, SignStyle.NEVER)
.optionalStart()
- .appendZoneId()
+ .appendLiteral(':')
+ .appendValue(SECOND_OF_MINUTE, 2)
.optionalEnd()
.optionalStart()
+ .appendLiteral('.')
+ .appendValue(MILLI_OF_SECOND, 3, 5, SignStyle.NEVER)
+ .optionalEnd()
.optionalStart()
- .appendLiteral(' ')
+ .appendLiteral(' ') //optional space before am/pm
.optionalEnd()
+ .appendText(ChronoField.AMPM_OF_DAY)
+ .optionalStart()
.optionalStart()
- .appendZoneRegionId()
+ .append(TIME_ZONE_FORMATTER)
.optionalEnd()
.toFormatter(Locale.US)
//.withZone(ZoneId.of("GMT")) see TIKA-3735
.withResolverStyle(ResolverStyle.LENIENT)
- .withResolverFields(DAY_OF_MONTH, MONTH_OF_YEAR, YEAR, HOUR_OF_DAY, MINUTE_OF_HOUR,
+ .withResolverFields(DAY_OF_MONTH, MONTH_OF_YEAR, YEAR, HOUR_OF_AMPM, AMPM_OF_DAY,
+ MINUTE_OF_HOUR,
SECOND_OF_MINUTE, MILLI_OF_SECOND, OFFSET_SECONDS);
+
+ public static final DateTimeFormatter MMM_D_YYYY_HH_MM_AM_PM = // "July 9 2012 10:10:10 am UTC"
+ new DateTimeFormatterBuilder()
+ .parseCaseInsensitive()
+ .parseLenient()
+ .appendPattern("MMM")
+ .appendLiteral(' ')
+ .appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NEVER)
+ .appendLiteral(' ')
+ .appendValueReduced(YEAR, 2, 4, INITIAL_YEAR)
+ .appendLiteral(' ')
+ .appendValue(ChronoField.HOUR_OF_AMPM, 1, 2, SignStyle.NEVER)
+ .appendLiteral(':')
+ .appendValue(MINUTE_OF_HOUR, 1, 2, SignStyle.NEVER)
+ .optionalStart()
+ .appendLiteral(':')
+ .appendValue(SECOND_OF_MINUTE, 2)
+ .optionalEnd()
+ .optionalStart()
+ .appendLiteral('.')
+ .appendValue(MILLI_OF_SECOND, 3, 5, SignStyle.NEVER)
+ .optionalEnd()
+ .optionalStart()
+ .appendLiteral(' ') //optional space before am/pm
+ .optionalEnd()
+ .appendText(ChronoField.AMPM_OF_DAY)
+ .optionalStart()
+ .append(TIME_ZONE_FORMATTER)
+ .optionalEnd()
+ .toFormatter(Locale.US)
+ //.withZone(ZoneId.of("GMT")) see TIKA-3735
+ .withResolverStyle(ResolverStyle.LENIENT)
+ .withResolverFields(DAY_OF_MONTH, MONTH_OF_YEAR, YEAR, HOUR_OF_AMPM, AMPM_OF_DAY,
+ MINUTE_OF_HOUR,
+ SECOND_OF_MINUTE, MILLI_OF_SECOND, OFFSET_SECONDS);
+
+ public static final DateTimeFormatter MMM_D_YYYY_HH_MM = // "July 9 2012 10:10:10 UTC"
+ new DateTimeFormatterBuilder()
+ .parseCaseInsensitive()
+ .parseLenient()
+ .appendPattern("MMM")
+ .appendLiteral(' ')
+ .appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NEVER)
+ .appendLiteral(' ')
+ .appendValueReduced(YEAR, 2, 4, INITIAL_YEAR)
+ .appendLiteral(' ')
+ .appendValue(HOUR_OF_DAY, 1, 2, SignStyle.NEVER)
+ .appendLiteral(':')
+ .appendValue(MINUTE_OF_HOUR, 1, 2, SignStyle.NEVER)
+ .optionalStart()
+ .appendLiteral(':')
+ .appendValue(SECOND_OF_MINUTE, 2)
+ .optionalEnd()
+ .optionalStart()
+ .appendLiteral('.')
+ .appendValue(MILLI_OF_SECOND, 3, 5, SignStyle.NEVER)
+ .optionalEnd()
+ .optionalStart()
+ .optionalStart()
+ .append(TIME_ZONE_FORMATTER)
+ .optionalEnd()
+ .toFormatter(Locale.US)
+ //.withZone(ZoneId.of("GMT")) see TIKA-3735
+ .withResolverStyle(ResolverStyle.LENIENT)
+ .withResolverFields(DAY_OF_MONTH, MONTH_OF_YEAR, YEAR, HOUR_OF_DAY,
+ MINUTE_OF_HOUR,
+ SECOND_OF_MINUTE, MILLI_OF_SECOND, OFFSET_SECONDS);
+
+ public static final DateTimeFormatter MM_SLASH_DD_SLASH_YY_HH_MM = //
+ // US-based month/day ordering !!!! e.g. "7/9/2012 10:10:10"
+ new DateTimeFormatterBuilder()
+ .parseCaseInsensitive()
+ .parseLenient()
+ .appendValue(MONTH_OF_YEAR, 1, 2, SignStyle.NEVER)
+ .appendLiteral('/')
+ .appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NEVER)
+ .appendLiteral('/')
+ .appendValueReduced(YEAR, 2, 4, INITIAL_YEAR)
+ .appendLiteral(' ')
+ .appendValue(HOUR_OF_DAY, 1, 2, SignStyle.NEVER)
+ .optionalStart()
+ .appendLiteral(':')
+ .appendValue(MINUTE_OF_HOUR, 1, 2, SignStyle.NEVER)
+ .optionalEnd()
+ .optionalStart()
+ .appendLiteral(':')
+ .appendValue(SECOND_OF_MINUTE, 2)
+ .optionalEnd()
+ .optionalStart()
+ .appendLiteral('.')
+ .appendValue(MILLI_OF_SECOND, 3, 5, SignStyle.NEVER)
+ .optionalEnd()
+ .optionalStart()
+ .append(TIME_ZONE_FORMATTER)
+ .optionalEnd()
+ .toFormatter(Locale.US)
+ //.withZone(ZoneId.of("GMT")) see TIKA-3735
+ .withResolverStyle(ResolverStyle.LENIENT)
+ .withResolverFields(DAY_OF_MONTH, MONTH_OF_YEAR, YEAR, HOUR_OF_DAY,
+ MINUTE_OF_HOUR,
+ SECOND_OF_MINUTE, MILLI_OF_SECOND, OFFSET_SECONDS);
+ public static final DateTimeFormatter MM_SLASH_DD_SLASH_YY_HH_MM_AM_PM =
+ // US-based month/day ordering !!!! e.g. "7/9/2012 10:10:10 AM UTC"
+ new DateTimeFormatterBuilder()
+ .parseCaseInsensitive()
+ .parseLenient()
+ .appendValue(MONTH_OF_YEAR, 1, 2, SignStyle.NEVER)
+ .appendLiteral('/')
+ .appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NEVER)
+ .appendLiteral('/')
+ .appendValueReduced(YEAR, 2, 4, INITIAL_YEAR)
+ .appendLiteral(' ')
+ .appendValue(HOUR_OF_AMPM, 1, 2, SignStyle.NEVER)
+ .optionalStart()
+ .appendLiteral(':')
+ .appendValue(MINUTE_OF_HOUR, 1, 2, SignStyle.NEVER)
+ .optionalEnd()
+ .optionalStart()
+ .appendLiteral(':')
+ .appendValue(SECOND_OF_MINUTE, 2)
+ .optionalEnd()
+ .optionalStart()
+ .appendLiteral('.')
+ .appendValue(MILLI_OF_SECOND, 3, 5, SignStyle.NEVER)
+ .optionalEnd()
+ .optionalStart()
+ .appendLiteral(' ')
+ .optionalEnd()
+ .appendText(AMPM_OF_DAY)
+ .optionalStart()
+ .append(TIME_ZONE_FORMATTER)
+ .optionalEnd()
+ .toFormatter(Locale.US)
+ //.withZone(ZoneId.of("GMT")) see TIKA-3735
+ .withResolverStyle(ResolverStyle.LENIENT)
+ .withResolverFields(DAY_OF_MONTH, MONTH_OF_YEAR, YEAR, HOUR_OF_AMPM,
+ AMPM_OF_DAY,
+ MINUTE_OF_HOUR,
+ SECOND_OF_MINUTE, MILLI_OF_SECOND, OFFSET_SECONDS);
+
+ public static final DateTimeFormatter YYYY_MM_DD_HH_MM = // "2012-10-10 10:10:10 UTC"
+ new DateTimeFormatterBuilder()
+ .parseCaseInsensitive()
+ .parseLenient()
+ .appendValue(YEAR, 4)
+ .appendLiteral('-')
+ .appendValue(MONTH_OF_YEAR, 2, 2, SignStyle.NEVER)
+ .appendLiteral('-')
+ .appendValue(DAY_OF_MONTH, 2, 2, SignStyle.NEVER)
+ .appendLiteral(' ')
+ .appendValue(HOUR_OF_DAY, 1, 2, SignStyle.NEVER)
+ .appendLiteral(':')
+ .appendValue(MINUTE_OF_HOUR, 1, 2, SignStyle.NEVER)
+ .optionalStart()
+ .appendLiteral(':')
+ .appendValue(SECOND_OF_MINUTE, 2)
+ .optionalEnd()
+ .optionalStart()
+ .appendLiteral('.')
+ .appendValue(MILLI_OF_SECOND, 3, 5, SignStyle.NEVER)
+ .optionalEnd()
+ .optionalStart()
+ .append(TIME_ZONE_FORMATTER)
+ .optionalEnd()
+ .toFormatter(Locale.US)
+ //.withZone(ZoneId.of("GMT")) see TIKA-3735
+ .withResolverStyle(ResolverStyle.LENIENT)
+ .withResolverFields(DAY_OF_MONTH, MONTH_OF_YEAR, YEAR, HOUR_OF_DAY,
+ MINUTE_OF_HOUR,
+ SECOND_OF_MINUTE, MILLI_OF_SECOND, OFFSET_SECONDS);
+
+ public static final DateTimeFormatter YYYY_MM_DD = // "2012-10-10"
+ new DateTimeFormatterBuilder()
+ .parseCaseInsensitive()
+ .parseLenient()
+ .appendValue(YEAR, 4)
+ .appendLiteral('-')
+ .appendValue(MONTH_OF_YEAR, 2, 2, SignStyle.NEVER)
+ .appendLiteral('-')
+ .appendValue(DAY_OF_MONTH, 2, 2, SignStyle.NEVER)
+ .toFormatter(Locale.US)
+ //.withZone(ZoneId.of("GMT")) see TIKA-3735
+ .withResolverStyle(ResolverStyle.LENIENT)
+ .withResolverFields(DAY_OF_MONTH, MONTH_OF_YEAR, YEAR);
+
public static final DateTimeFormatter MM_SLASH_DD_SLASH_YYYY =
new DateTimeFormatterBuilder()
.appendPattern("M/d/")
.appendValueReduced(ChronoField.YEAR, 2, 4, INITIAL_YEAR)
- .toFormatter().withZone(MIDDAY.toZoneId());
+ .toFormatter(Locale.US).withZone(MIDDAY.toZoneId());
public static final DateTimeFormatter DD_SLASH_MM_SLASH_YYYY =
new DateTimeFormatterBuilder()
.appendPattern("d/M/")
.appendValueReduced(ChronoField.YEAR, 2, 4, INITIAL_YEAR)
- .toFormatter().withZone(MIDDAY.toZoneId());
- public static final DateTimeFormatter MMM_D_YY =
- DateTimeFormatter.ofPattern("MMM d yy", Locale.US)
- .withZone(MIDDAY.toZoneId());
-
- public static final DateTimeFormatter EEE_D_MMM_YY =
- DateTimeFormatter.ofPattern("EEE d MMM yy", Locale.US)
- .withZone(MIDDAY.toZoneId());
+ .toFormatter(Locale.US).withZone(MIDDAY.toZoneId());
+ public static final DateTimeFormatter MMM_DD_YY =
+ new DateTimeFormatterBuilder()
+ .parseCaseInsensitive()
+ .parseLenient()
+ .appendPattern("MMM")
+ .appendLiteral(' ')
+ .appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NEVER)
+ .appendLiteral(' ')
+ .appendValueReduced(YEAR, 2, 4, INITIAL_YEAR)
+ .toFormatter(Locale.US);
- public static final DateTimeFormatter D_MMM_YY =
- DateTimeFormatter.ofPattern("d MMM yy", Locale.US)
- .withZone(MIDDAY.toZoneId());
+ public static final DateTimeFormatter DD_MMM_YY =
+ new DateTimeFormatterBuilder()
+ .parseCaseInsensitive()
+ .parseLenient()
+ .appendValue(DAY_OF_MONTH, 1, 2, SignStyle.NEVER)
+ .appendLiteral(' ')
+ .appendPattern("MMM")
+ .appendLiteral(' ')
+ .appendValueReduced(YEAR, 2, 4, INITIAL_YEAR)
+ .toFormatter(Locale.US);
public static final DateTimeFormatter YY_SLASH_MM_SLASH_DD =
new DateTimeFormatterBuilder()
.appendValueReduced(ChronoField.YEAR, 2, 4, INITIAL_YEAR)
.appendPattern("/M/d")
- .toFormatter().withZone(MIDDAY.toZoneId());
+ .toFormatter(Locale.US).withZone(MIDDAY.toZoneId());
private static final DateTimeFormatter[] DATE_FORMATTERS = new DateTimeFormatter[] {
- EEE_D_MMM_YY,
- D_MMM_YY,
- MMM_D_YY,
+ DD_MMM_YY,
+ MMM_DD_YY,
+ YYYY_MM_DD,
MM_SLASH_DD_SLASH_YYYY,//try American first?
DD_SLASH_MM_SLASH_YYYY,//if that fails, try rest of world?
YY_SLASH_MM_SLASH_DD
@@ -227,41 +469,18 @@ public class MailDateParser {
private static final DateTimeFormatter[] DATE_TIME_FORMATTERS = new DateTimeFormatter[] {
- /*DateTimeFormatter.ofPattern("MMM dd yy hh:mm a", Locale.US).withZone(UTC),
- DateTimeFormatter.ofPattern("EEE d MMM yy HH:mm:ss z", Locale.US).withZone(UTC),
- DateTimeFormatter.ofPattern("EEE d MMM yy HH:mm:ss", Locale.US),
- // Sunday, May 15 2016 1:32 PM
- DateTimeFormatter.ofPattern("EEEEE MMM d yy hh:mm a", Locale.US),
- //16 May 2016 at 09:30:32 GMT+1 (Mac Mail TIKA-1970)
- DateTimeFormatter.ofPattern("d MMM yy 'at' HH:mm:ss z", Locale.US).withZone(UTC),
- DateTimeFormatter.ofPattern("yy-MM-dd HH:mm:ss", Locale.US),*/
RFC_5322_LENIENT,
-
- //this assumes US ordering M/d/ -- we need to add non-US too
- //7/20/95 1:12PM OR 7/20/95 1:12:14PM OR 06/24/2008 Tuesday 11 AM
- /*new DateTimeFormatterBuilder()
- .appendPattern("M/d/")
- .appendValueReduced(ChronoField.YEAR, 2, 4, INITIAL_YEAR)
- .optionalStart()
- .appendLiteral(' ')
- .appendText(DAY_OF_WEEK, dayOfWeekLenient())
- .optionalEnd()
- .appendLiteral(' ')
- .appendValue(ChronoField.HOUR_OF_AMPM, 1, 2, SignStyle.NEVER)
- .optionalStart()
- .appendLiteral(':')
- .appendValue(MINUTE_OF_HOUR, 1, 2, SignStyle.NEVER)
- .optionalEnd()
- .optionalStart()
- .appendLiteral(':')
- .appendValue(SECOND_OF_MINUTE, 2)
- .optionalEnd()
- .appendLiteral(' ')
- .appendText(ChronoField.AMPM_OF_DAY)
- .toFormatter().withZone(MIDDAY.toZoneId())*/
+ RFC_5322_AMPM_LENIENT,
+ MMM_D_YYYY_HH_MM,
+ MMM_D_YYYY_HH_MM_AM_PM,
+ YYYY_MM_DD_HH_MM,
+ MM_SLASH_DD_SLASH_YY_HH_MM,
+ MM_SLASH_DD_SLASH_YY_HH_MM_AM_PM
};
- public static Date parseDate(String string) throws ParseException {
+ public static Date parseRFC5322(String string) throws ParseException {
+ //this fails on: MON, 9 MAY 2016 3:32:00 GMT+0200 ... it stops short and doesn't include
+ // the +0200?!
if (string != null) {
string = string.trim();
string = string.toUpperCase(Locale.US);
@@ -273,37 +492,34 @@ public class MailDateParser {
if (text == null) {
return null;
}
- text = text.replaceAll("\\s+", " ").trim();
- text = text.toUpperCase(Locale.US);
- try {
- return parseDate(text);
- } catch (SecurityException e) {
- throw e;
- } catch (Exception e) {
- //ignore
- }
-
String normalized = normalize(text);
for (DateTimeFormatter dateTimeFormatter : DATE_TIME_FORMATTERS) {
try {
- TemporalQuery<TemporalUnit> query = TemporalQueries.precision();
-
- TemporalAccessor temporalAccessor = dateTimeFormatter.parse(normalized);
- if (hasInstantSeconds(temporalAccessor)) {
- System.out.println("precision: " + temporalAccessor.query(query));
- System.out.println(temporalAccessor.get(SECOND_OF_MINUTE));
-
- System.out.println(temporalAccessor.getClass() + " : " + temporalAccessor);
- }
- return Date.from(Instant.from(dateTimeFormatter.parse(normalized)));
+ ZonedDateTime zonedDateTime = ZonedDateTime.parse(normalized, dateTimeFormatter);
+ return Date.from(Instant.from(zonedDateTime));
} catch (SecurityException e) {
throw e;
+ } catch (DateTimeParseException e) {
+
+ //There's a bug in java 8 that if we include .withZone in the DateTimeFormatter,
+ //that will override the offset/timezone id even if it is included
+ // in the original string. This is fixed in later versions of Java.
+ // Once we move to Java 11, we can get rid of this. Can't make this up...
+ try {
+ LocalDateTime localDateTime = LocalDateTime.parse(normalized, dateTimeFormatter);
+ return Date.from(Instant.from(localDateTime.atOffset(UTC)));
+ } catch (SecurityException e2) {
+ throw e2;
+ } catch (Exception e2) {
+ //swallow
+ }
} catch (Exception e) {
- System.err.println(dateTimeFormatter);
- e.printStackTrace();
+ //can get StringIndexOutOfBoundsException because of a bug in java 8
//ignore
}
}
+
+
for (DateTimeFormatter dateFormatter : DATE_FORMATTERS) {
try {
TemporalAccessor temporalAccessor = dateFormatter.parse(normalized);
@@ -314,7 +530,6 @@ public class MailDateParser {
} catch (SecurityException e) {
throw e;
} catch (Exception e) {
- //e.printStackTrace();
//ignore
}
}
@@ -331,18 +546,54 @@ public class MailDateParser {
}
private static String normalize(String text) {
+
+ text = text.toUpperCase(Locale.US);
+
//strip out commas
text = text.replaceAll(",", "");
- Matcher matcher = GENERAL_TIME_ZONE_NO_MINUTES_PATTERN.matcher(text);
+ //strip off extra stuff after +0800, e.g. "Mon, 9 May 2016 7:32:00 UTC+0600 (BST)",
+ Matcher matcher = OFFSET_PATTERN.matcher(text);
+ if (matcher.find()) {
+ text = text.substring(0, matcher.end());
+ }
+
+ matcher = LOCALIZED_OFFSET_PATTERN.matcher(text);
if (matcher.find()) {
- text = matcher.replaceFirst("GMT$1$2:00");
+ text = buildLocalizedOffset(matcher, text);
}
matcher = AM_PM.matcher(text);
if (matcher.find()) {
text = matcher.replaceFirst("$1 $2");
}
+ //The rfc_lenient parser had a problem parsing dates
+ //with days of week missing and a timezone: 9 May 2016 01:32:00 UTC
+ //The day of week is not used in the resolvers, so we may as well throw
+ //out that info
+ matcher = DAYS_OF_WEEK.matcher(text);
+ if (matcher.find()) {
+ text = matcher.replaceAll(" ");
+ }
+ //16 May 2016 at 09:30:32 GMT+1
+ text = text.replaceAll("(?i) at ", " ");
+ //collapse any runs of whitespace left over from the replacements above
+ text = text.replaceAll("\\s+", " ").trim();
return text;
}
+
+ private static String buildLocalizedOffset(Matcher matcher, String text) {
+ StringBuilder sb = new StringBuilder();
+ sb.append(text.substring(0, matcher.start()));
+ sb.append(matcher.group(1));// +/-
+ sb.append(StringUtils.leftPad(matcher.group(2), 2, '0'));//HH
+ sb.append(":");
+ if (matcher.group(3) != null) {
+ sb.append(matcher.group(3));
+ } else {
+ sb.append("00");
+ }
+ sb.append(text.substring(matcher.end()));
+ return sb.toString();
+ }
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-commons/src/test/java/org/apache/tika/parser/mailcommons/MailDateParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-commons/src/test/java/org/apache/tika/parser/mailcommons/MailDateParserTest.java
index 05ee7c19f..d98c3d464 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-commons/src/test/java/org/apache/tika/parser/mailcommons/MailDateParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-commons/src/test/java/org/apache/tika/parser/mailcommons/MailDateParserTest.java
@@ -20,8 +20,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
-import java.io.InputStream;
-import java.nio.charset.StandardCharsets;
import java.text.DateFormat;
import java.text.DateFormatSymbols;
import java.text.SimpleDateFormat;
@@ -29,14 +27,8 @@ import java.util.Date;
import java.util.Locale;
import java.util.TimeZone;
+import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
-import org.xml.sax.helpers.DefaultHandler;
-
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
public class MailDateParserTest {
@@ -47,28 +39,79 @@ public class MailDateParserTest {
//try with timezones
for (String dateString : new String[] {
// with timezone info:
- "Thu, 9 May 16 01:32:00 GMT",
- "Thu, 9 May 2016 01:32:00 UTC",
- "Thu, 9 May 2016 01:32:00Z",
- "Thu, 9 May 2016 01:32:00 GMT",
- "Thu, 9 May 2016 01:32:00 UTC",
+ "Mon, 9 May 16 01:32:00 GMT",
+ "9 May 16 01:32:00 GMT",
+ "Monday, 9 May 16 01:32:00 GMT",
+ "Mon, 9 May 2016 01:32:00 UTC",
+ "9 May 2016 01:32:00 UTC",
+ "09 May 2016 01:32:00 UTC",
+ "Mon, 9 May 2016 01:32:00Z",
+ "Mon, 9 May 2016 01:32:00 Z",
+ "Mon, 9 May 2016 01:32:00 GMT",
+ "Mon, 9 May 2016 01:32:00GMT",
+ "Mon, 9 May 2016 01:32:00 UTC",
+ "Mon, 9 May 2016 01:32:00UTC",
+
+ "Mon, 9 May 2016 3:32:00 GMT+0200",
+ "Mon, 9 May 2016 3:32:00 UTC+0200",
+ "Mon, 9 May 2016 7:32:00 UTC+0600 (BST)",
+
//try with leading space
- " Thu, 9 May 2016 3:32:00 +0200",
- "Thu, 9 May 2016 3:32:00 +02:00",
- // format correctly handled by mime4j if no leading whitespace
- " Wed, 8 May 2016 20:32:00 EST",}) {
+ " Mon, 9 May 2016 3:32:00 +0200",
+ " 9 May 2016 3:32:00 +0200",
+ "Mon, 9 May 2016 3:32:00 +02:00",
+ "9 May 2016 3:32:00 +02:00",
+ "Mon, 9 May 2016 3:32:00+02:00",
+ "Mon, 9 May 2016 3:32:00+0200",
+ " Sun, 8 May 2016 21:32:00 EST",
+ //need to add am/pm format times? I hope not.
+
+ }) {
testDate(dateString, expected, true);
}
}
+ @Test
+ @Disabled("for dev purposes")
+ public void oneOff() throws Exception {
+ /* SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss z");
+ System.out.println(simpleDateFormat.format(new Date()));
+ DateTimeFormatter formatter = DateTimeFormatter
+ .ofPattern("yyyy-MM-dd'T'HH:mm:ss.S OOOO")
+ .withLocale(Locale.US);
+ String date = formatter.format(ZonedDateTime.now(ZoneOffset.UTC));
+ System.out.println("String: " + date);
+ System.out.println("parsed: " + formatter.parse(date) + " from " + date);
+*/
+ String s = "Mon, 6 Sep 2010 05:25:34 -0400 (EDT)";
+ s = "Tue, 9 Jun 2009 23:58:45 -0400";
+
+ //System.out.println(RFC)
+ try {//turn this back on when we upgrade
+ //System.out.println("mime4j: " + DateTimeFieldLenientImpl.RFC_5322.parse(s));
+ } catch (Exception e) {
+ System.out.println("mime4j: null");
+ }
+ try {
+ Date d = MailDateParser.parseDateLenient(s);
+ DateFormat df =
+ new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", new DateFormatSymbols(Locale.US));
+ df.setTimeZone(TimeZone.getTimeZone("UTC"));
+ String dateString = df.format(d);
+ System.out.println("dev parser lenient: " + dateString);
+ } catch (Exception e) {
+ System.out.println("dev parser lenient: null");
+ }
+ }
+
@Test
public void testDateTimesWithNoTimeZone() throws Exception {
String expected = "2016-05-09T01:32:00Z";
for (String dateString : new String[]{
- /* "Thu, 9 May 2016 01:32:00",
- "Thursday, May 9 2016 1:32 AM", "May 9 2016 1:32am", "May 9 2016 1:32 am",
- "2016-05-09 01:32:00"*/}) {
+ "Mon, 9 May 2016 01:32:00",
+ "Monday, 9 May 2016 1:32 AM", "May 9 2016 1:32am", "May 9 2016 1:32 am",
+ "2016-05-09 01:32:00"}) {
testDate(dateString, expected, true);
}
}
@@ -77,8 +120,11 @@ public class MailDateParserTest {
public void testDates() throws Exception {
//now try days without times
String expected = "2016-05-15T12:00:00Z";
- for (String dateString : new String[]{"May 15, 2016", "Sun, 15 May 2016", "15 May 2016",}) {
- testDate(dateString, expected, false);
+ for (String dateString : new String[]{
+ "May 15, 2016", "Sun, 15 May 2016", "15 May 2016",
+ "2016-05-15"
+ }) {
+ testDate(dateString, expected, true);
}
}
@@ -106,7 +152,6 @@ public class MailDateParserTest {
"06/24/2008, Tuesday, 11 AM",
}) {
Date parsedDate = MailDateParser.parseDateLenient(dateString);
- System.out.println(parsedDate);
assertNotNull(parsedDate);
if (parsedDate != null) {
assertTrue(parsedDate.getTime() > date1980.getTime(),
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
index 355f25143..1ea75cda4 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
@@ -16,15 +16,8 @@
*/
package org.apache.tika.parser.mail;
-import static org.apache.tika.utils.DateUtils.MIDDAY;
-import static org.apache.tika.utils.DateUtils.UTC;
-
import java.io.IOException;
import java.io.InputStream;
-import java.text.DateFormat;
-import java.text.DateFormatSymbols;
-import java.text.ParseException;
-import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
@@ -32,9 +25,6 @@ import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Stack;
-import java.util.TimeZone;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
@@ -86,38 +76,6 @@ class MailContentHandler implements ContentHandler {
private static final String MULTIPART_ALTERNATIVE = "multipart/alternative";
- private static final DateFormatInfo[] ALTERNATE_DATE_FORMATS = new DateFormatInfo[] {
- //note that the string is "cleaned" before processing:
- //1) condense multiple whitespace to single space
- //2) trim()
- //3) strip out commas
- //4) insert space before am/pm
- new DateFormatInfo("MMM dd yy hh:mm a"),
-
- //this is a standard pattern handled by mime4j;
- //but mime4j fails with leading whitespace
- new DateFormatInfo("EEE d MMM yy HH:mm:ss Z", UTC),
-
- new DateFormatInfo("EEE d MMM yy HH:mm:ss z", UTC),
-
- new DateFormatInfo("EEE d MMM yy HH:mm:ss", null),// no timezone
-
- new DateFormatInfo("EEEEE MMM d yy hh:mm a", null),// Sunday, May 15 2016 1:32 PM
-
- //16 May 2016 at 09:30:32 GMT+1 (Mac Mail TIKA-1970)
- new DateFormatInfo("d MMM yy 'at' HH:mm:ss z", UTC), // UTC/Zulu
-
- new DateFormatInfo("yy-MM-dd HH:mm:ss", null),
-
- new DateFormatInfo("MM/dd/yy hh:mm a", null, false),
-
- //now dates without times
- new DateFormatInfo("MMM d yy", MIDDAY, false),
- new DateFormatInfo("EEE d MMM yy", MIDDAY, false),
- new DateFormatInfo("d MMM yy", MIDDAY, false),
- new DateFormatInfo("yy/MM/dd", MIDDAY, false),
- new DateFormatInfo("MM/dd/yy", MIDDAY, false)};
-
private final XHTMLContentHandler handler;
private final Metadata metadata;
private final ParseContext parseContext;
@@ -146,16 +104,6 @@ class MailContentHandler implements ContentHandler {
this.detector = detector;
}
- private static DateFormat createDateFormat(DateFormatInfo dateFormatInfo) {
- SimpleDateFormat sdf = new SimpleDateFormat(dateFormatInfo.pattern,
- new DateFormatSymbols(Locale.US));
- if (dateFormatInfo.timeZone != null) {
- sdf.setTimeZone(dateFormatInfo.timeZone);
- }
- sdf.setLenient(dateFormatInfo.lenient);
- return sdf;
- }
-
@Override
public void body(BodyDescriptor body, InputStream is) throws MimeException, IOException {
// use a different metadata object
@@ -398,14 +346,12 @@ class MailContentHandler implements ContentHandler {
Date date = null;
try {
date = MailDateParser.parseDateLenient(dateBody);
+ metadata.set(TikaCoreProperties.CREATED, date);
} catch (SecurityException e) {
throw e;
} catch (Exception e) {
//swallow
}
- if (date != null) {
- metadata.set(TikaCoreProperties.CREATED, date);
- }
} else {
metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX + parsedField.getName(),
field.getBody());
@@ -618,24 +564,4 @@ class MailContentHandler implements ContentHandler {
this.bytes = bytes;
}
}
-
- private static class DateFormatInfo {
- String pattern;
- TimeZone timeZone;
- boolean lenient;
-
- public DateFormatInfo(String pattern) {
- this(pattern, null, true);
- }
-
- public DateFormatInfo(String pattern, TimeZone timeZone) {
- this(pattern, timeZone, true);
- }
-
- public DateFormatInfo(String pattern, TimeZone timeZone, boolean lenient) {
- this.pattern = pattern;
- this.timeZone = timeZone;
- this.lenient = lenient;
- }
- }
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
index a09a3fcfa..d01ae191a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
@@ -16,13 +16,12 @@
*/
package org.apache.tika.parser.mbox;
-import static org.apache.tika.parser.mailcommons.MailDateParser.parseDate;
+import static org.apache.tika.parser.mailcommons.MailDateParser.parseDateLenient;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
-import java.text.ParseException;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
@@ -194,9 +193,13 @@ public class MboxParser extends AbstractParser {
metadata.add(TikaCoreProperties.SUBJECT, headerContent);
} else if (headerTag.equalsIgnoreCase("Date")) {
try {
- Date date = parseDate(headerContent);
- metadata.set(TikaCoreProperties.CREATED, date);
- } catch (ParseException e) {
+ Date date = parseDateLenient(headerContent);
+ if (date != null) {
+ metadata.set(TikaCoreProperties.CREATED, date);
+ }
+ } catch (SecurityException e) {
+ throw e;
+ } catch (Exception e) {
// ignoring date because format was not understood
}
} else if (headerTag.equalsIgnoreCase("Message-Id")) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
index 3c2d422de..04b9cd657 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
@@ -32,13 +32,7 @@ import static org.mockito.Mockito.verify;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
-import java.text.DateFormat;
-import java.text.DateFormatSymbols;
-import java.text.SimpleDateFormat;
-import java.util.Date;
import java.util.List;
-import java.util.Locale;
-import java.util.TimeZone;
import org.apache.james.mime4j.stream.MimeConfig;
import org.junit.jupiter.api.BeforeAll;
@@ -549,6 +543,9 @@ public class RFC822ParserTest extends TikaTest {
List<Metadata> metadataList = getRecursiveMetadata("testRFC822-ARC");
assertEquals(1, metadataList.size());
assertEquals("message/rfc822", metadataList.get(0).get(Metadata.CONTENT_TYPE));
+
+ //Also, test that this date has been parsed: Wed, 26 Jan 2022 09:14:37 +0100 (CET)
+ assertTrue(metadataList.get(0).get(TikaCoreProperties.CREATED).startsWith("2022-01-"));
}
@Test
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 4e9ce90e2..5e8b0e77a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -23,7 +23,6 @@ import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
-import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
@@ -257,10 +256,12 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
// See if we can parse it as a normal mail date
try {
- Date d = MailDateParser.parseDate(date);
+ Date d = MailDateParser.parseDateLenient(date);
metadata.set(TikaCoreProperties.CREATED, d);
metadata.set(TikaCoreProperties.MODIFIED, d);
- } catch (ParseException e) {
+ } catch (SecurityException e ) {
+ throw e;
+ } catch (Exception e) {
// Store it as-is, and hope for the best...
metadata.set(TikaCoreProperties.CREATED, date);
metadata.set(TikaCoreProperties.MODIFIED, date);