You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/07/13 14:41:51 UTC

[tika] 02/02: TIKA-3472 -- SimpleDateFormat is not thread-safe

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit a4b196a0c28eacd3c85054f7b969cf55476c5d7a
Author: tallison <ta...@apache.org>
AuthorDate: Tue Jul 13 10:41:28 2021 -0400

    TIKA-3472 -- SimpleDateFormat is not thread-safe
---
 .../tika/parser/mail/MailContentHandler.java       | 72 +++++++++++++---------
 1 file changed, 44 insertions(+), 28 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
index 8dde6ec..e4f3697 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
@@ -94,39 +94,38 @@ class MailContentHandler implements ContentHandler {
     //use this pattern to insert space: 10:30 am
     private static final Pattern AM_PM = Pattern.compile("(?i)(\\d)([ap]m)\\b");
 
-    private static final DateFormat[] ALTERNATE_DATE_FORMATS = new DateFormat[]{
+    private static final DateFormatInfo[] ALTERNATE_DATE_FORMATS = new DateFormatInfo[] {
             //note that the string is "cleaned" before processing:
             //1) condense multiple whitespace to single space
             //2) trim()
             //3) strip out commas
             //4) insert space before am/pm
-
-            //May 16 2016 1:32am
-            createDateFormat("MMM dd yy hh:mm a", null),
+            new DateFormatInfo("MMM dd yy hh:mm a"),
 
             //this is a standard pattern handled by mime4j;
             //but mime4j fails with leading whitespace
-            createDateFormat("EEE d MMM yy HH:mm:ss Z", UTC),
+            new DateFormatInfo("EEE d MMM yy HH:mm:ss Z", UTC),
 
-            createDateFormat("EEE d MMM yy HH:mm:ss z", UTC),
+            new DateFormatInfo("EEE d MMM yy HH:mm:ss z", UTC),
 
-            createDateFormat("EEE d MMM yy HH:mm:ss", null),// no timezone
+            new DateFormatInfo("EEE d MMM yy HH:mm:ss", null),// no timezone
 
-            createDateFormat("EEEEE MMM d yy hh:mm a", null),// Sunday, May 15 2016 1:32 PM
+            new DateFormatInfo("EEEEE MMM d yy hh:mm a", null),// Sunday, May 15 2016 1:32 PM
 
             //16 May 2016 at 09:30:32  GMT+1 (Mac Mail TIKA-1970)
-            createDateFormat("d MMM yy 'at' HH:mm:ss z", UTC),   // UTC/Zulu
+            new DateFormatInfo("d MMM yy 'at' HH:mm:ss z", UTC),   // UTC/Zulu
 
-            createDateFormat("yy-MM-dd HH:mm:ss", null),
+            new DateFormatInfo("yy-MM-dd HH:mm:ss", null),
 
-            createDateFormat("MM/dd/yy hh:mm a", null, false),
+            new DateFormatInfo("MM/dd/yy hh:mm a", null, false),
 
             //now dates without times
-            createDateFormat("MMM d yy", MIDDAY, false),
-            createDateFormat("EEE d MMM yy", MIDDAY, false),
-            createDateFormat("d MMM yy", MIDDAY, false),
-            createDateFormat("yy/MM/dd", MIDDAY, false),
-            createDateFormat("MM/dd/yy", MIDDAY, false)};
+            new DateFormatInfo("MMM d yy", MIDDAY, false),
+            new DateFormatInfo("EEE d MMM yy", MIDDAY, false),
+            new DateFormatInfo("d MMM yy", MIDDAY, false),
+            new DateFormatInfo("yy/MM/dd", MIDDAY, false),
+            new DateFormatInfo("MM/dd/yy", MIDDAY, false)};
+
     private final XHTMLContentHandler handler;
     private final Metadata metadata;
     private final ParseContext parseContext;
@@ -155,21 +154,17 @@ class MailContentHandler implements ContentHandler {
         this.detector = detector;
     }
 
-    private static DateFormat createDateFormat(String format, TimeZone timezone) {
-        return createDateFormat(format, timezone, true);
-    }
-
-    private static DateFormat createDateFormat(String format, TimeZone timezone,
-                                               boolean isLenient) {
-        SimpleDateFormat sdf = new SimpleDateFormat(format, new DateFormatSymbols(Locale.US));
-        if (timezone != null) {
-            sdf.setTimeZone(timezone);
+    private static DateFormat createDateFormat(DateFormatInfo dateFormatInfo) {
+        SimpleDateFormat sdf = new SimpleDateFormat(dateFormatInfo.pattern,
+                new DateFormatSymbols(Locale.US));
+        if (dateFormatInfo.timeZone != null) {
+            sdf.setTimeZone(dateFormatInfo.timeZone);
         }
-        sdf.setLenient(isLenient);
+        sdf.setLenient(dateFormatInfo.lenient);
         return sdf;
     }
 
-    private static synchronized Date tryOtherDateFormats(String text) {
+    private static Date tryOtherDateFormats(String text) {
         if (text == null) {
             return null;
         }
@@ -187,8 +182,9 @@ class MailContentHandler implements ContentHandler {
             text = matcher.replaceFirst("$1 $2");
         }
 
-        for (DateFormat format : ALTERNATE_DATE_FORMATS) {
+        for (DateFormatInfo formatInfo : ALTERNATE_DATE_FORMATS) {
             try {
+                DateFormat format = createDateFormat(formatInfo);
                 return format.parse(text);
             } catch (ParseException e) {
                 //continue
@@ -653,4 +649,24 @@ class MailContentHandler implements ContentHandler {
             this.bytes = bytes;
         }
     }
+
+    private static class DateFormatInfo {
+        String pattern;
+        TimeZone timeZone;
+        boolean lenient;
+
+        public DateFormatInfo(String pattern) {
+            this(pattern, null, true);
+        }
+
+        public DateFormatInfo(String pattern, TimeZone timeZone) {
+            this(pattern, timeZone, true);
+        }
+
+        public DateFormatInfo(String pattern, TimeZone timeZone, boolean lenient) {
+            this.pattern = pattern;
+            this.timeZone = timeZone;
+            this.lenient = lenient;
+        }
+    }
 }