You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/07/13 14:41:49 UTC

[tika] branch main updated (020430f -> a4b196a)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from 020430f  TIKA-3474 -- handle 1.x container exception key
     new c26e9c1  TIKA-3474 -- handle 1.x container exception key
     new a4b196a  TIKA-3472 -- simple date format is not threadsafe

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../pom.xml                                        |  4 ++
 .../tika-pipes-solr-integration-tests/pom.xml      |  4 ++
 tika-parent/pom.xml                                | 10 +--
 .../tika/parser/mail/MailContentHandler.java       | 72 +++++++++++++---------
 4 files changed, 57 insertions(+), 33 deletions(-)

[tika] 01/02: TIKA-3474 -- handle 1.x container exception key

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit c26e9c1572bdbcdb76d2ec87939882251a714125
Author: tallison <ta...@apache.org>
AuthorDate: Tue Jul 13 10:21:23 2021 -0400

    TIKA-3474 -- handle 1.x container exception key
---
 .../tika-pipes-opensearch-integration-tests/pom.xml            |  4 ++++
 .../tika-pipes-solr-integration-tests/pom.xml                  |  4 ++++
 tika-parent/pom.xml                                            | 10 +++++-----
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/pom.xml b/tika-integration-tests/tika-pipes-opensearch-integration-tests/pom.xml
index f687b59..1b5ea08 100644
--- a/tika-integration-tests/tika-pipes-opensearch-integration-tests/pom.xml
+++ b/tika-integration-tests/tika-pipes-opensearch-integration-tests/pom.xml
@@ -45,6 +45,10 @@
           <groupId>com.fasterxml.jackson.core</groupId>
           <artifactId>jackson-annotations</artifactId>
         </exclusion>
+        <exclusion>
+          <groupId>org.apache.commons</groupId>
+          <artifactId>commons-compress</artifactId>
+        </exclusion>
       </exclusions>
     </dependency>
     <dependency>
diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/pom.xml b/tika-integration-tests/tika-pipes-solr-integration-tests/pom.xml
index 82b2b9a..e250427 100644
--- a/tika-integration-tests/tika-pipes-solr-integration-tests/pom.xml
+++ b/tika-integration-tests/tika-pipes-solr-integration-tests/pom.xml
@@ -45,6 +45,10 @@
           <groupId>com.fasterxml.jackson.core</groupId>
           <artifactId>jackson-annotations</artifactId>
         </exclusion>
+        <exclusion>
+          <groupId>org.apache.commons</groupId>
+          <artifactId>commons-compress</artifactId>
+        </exclusion>
       </exclusions>
     </dependency>
     <dependency>
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 5183d87..40c4cec 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -278,8 +278,8 @@
     <rat.version>0.13</rat.version>
 
     <!-- dependency versions -->
-    <aws.s3.version>1.12.18</aws.s3.version>
-    <aws.transcribe.version>1.12.18</aws.transcribe.version>
+    <aws.s3.version>1.12.22</aws.s3.version>
+    <aws.transcribe.version>1.12.22</aws.transcribe.version>
     <asm.version>9.2</asm.version>
     <boilerpipe.version>1.1.0</boilerpipe.version>
     <!-- used by POI, PDFBox and Jackcess ...try to sync -->
@@ -289,7 +289,7 @@
     <commons.cli.version>1.4</commons.cli.version>
     <commons.codec.version>1.15</commons.codec.version>
     <commons.collections4.version>4.4</commons.collections4.version>
-    <commons.compress.version>1.20</commons.compress.version>
+    <commons.compress.version>1.21</commons.compress.version>
     <commons.csv.version>1.8</commons.csv.version>
     <commons.exec.version>1.3</commons.exec.version>
     <commons.io.version>2.10.0</commons.io.version>
@@ -309,7 +309,7 @@
     <imageio.version>1.4.0</imageio.version>
     <jackcess.version>4.0.1</jackcess.version>
     <jackcess.encrypt.version>4.0.1</jackcess.encrypt.version>
-    <jackrabbit.version>2.21.6</jackrabbit.version>
+    <jackrabbit.version>2.21.7</jackrabbit.version>
     <jackson.version>2.12.4</jackson.version>
     <javax.annotation.version>1.3.2</javax.annotation.version>
     <javax.jcr.version>2.0</javax.jcr.version>
@@ -318,7 +318,7 @@
     <jbig2.version>3.0.3</jbig2.version>
     <jdom2.version>2.0.6</jdom2.version>
     <jempbox.version>1.8.16</jempbox.version>
-    <jetty.version>9.4.42.v20210604</jetty.version>
+    <jetty.version>9.4.43.v20210629</jetty.version>
     <jhighlight.version>1.0.3</jhighlight.version>
     <jna.version>5.8.0</jna.version>
     <joda.time.version>2.10.10</joda.time.version>

[tika] 02/02: TIKA-3472 -- simple date format is not threadsafe

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit a4b196a0c28eacd3c85054f7b969cf55476c5d7a
Author: tallison <ta...@apache.org>
AuthorDate: Tue Jul 13 10:41:28 2021 -0400

    TIKA-3472 -- simple date format is not threadsafe
---
 .../tika/parser/mail/MailContentHandler.java       | 72 +++++++++++++---------
 1 file changed, 44 insertions(+), 28 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
index 8dde6ec..e4f3697 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
@@ -94,39 +94,38 @@ class MailContentHandler implements ContentHandler {
     //use this pattern to insert space: 10:30 am
     private static final Pattern AM_PM = Pattern.compile("(?i)(\\d)([ap]m)\\b");
 
-    private static final DateFormat[] ALTERNATE_DATE_FORMATS = new DateFormat[]{
+    private static final DateFormatInfo[] ALTERNATE_DATE_FORMATS = new DateFormatInfo[] {
             //note that the string is "cleaned" before processing:
             //1) condense multiple whitespace to single space
             //2) trim()
             //3) strip out commas
             //4) insert space before am/pm
-
-            //May 16 2016 1:32am
-            createDateFormat("MMM dd yy hh:mm a", null),
+            new DateFormatInfo("MMM dd yy hh:mm a"),
 
             //this is a standard pattern handled by mime4j;
             //but mime4j fails with leading whitespace
-            createDateFormat("EEE d MMM yy HH:mm:ss Z", UTC),
+            new DateFormatInfo("EEE d MMM yy HH:mm:ss Z", UTC),
 
-            createDateFormat("EEE d MMM yy HH:mm:ss z", UTC),
+            new DateFormatInfo("EEE d MMM yy HH:mm:ss z", UTC),
 
-            createDateFormat("EEE d MMM yy HH:mm:ss", null),// no timezone
+            new DateFormatInfo("EEE d MMM yy HH:mm:ss", null),// no timezone
 
-            createDateFormat("EEEEE MMM d yy hh:mm a", null),// Sunday, May 15 2016 1:32 PM
+            new DateFormatInfo("EEEEE MMM d yy hh:mm a", null),// Sunday, May 15 2016 1:32 PM
 
             //16 May 2016 at 09:30:32  GMT+1 (Mac Mail TIKA-1970)
-            createDateFormat("d MMM yy 'at' HH:mm:ss z", UTC),   // UTC/Zulu
+            new DateFormatInfo("d MMM yy 'at' HH:mm:ss z", UTC),   // UTC/Zulu
 
-            createDateFormat("yy-MM-dd HH:mm:ss", null),
+            new DateFormatInfo("yy-MM-dd HH:mm:ss", null),
 
-            createDateFormat("MM/dd/yy hh:mm a", null, false),
+            new DateFormatInfo("MM/dd/yy hh:mm a", null, false),
 
             //now dates without times
-            createDateFormat("MMM d yy", MIDDAY, false),
-            createDateFormat("EEE d MMM yy", MIDDAY, false),
-            createDateFormat("d MMM yy", MIDDAY, false),
-            createDateFormat("yy/MM/dd", MIDDAY, false),
-            createDateFormat("MM/dd/yy", MIDDAY, false)};
+            new DateFormatInfo("MMM d yy", MIDDAY, false),
+            new DateFormatInfo("EEE d MMM yy", MIDDAY, false),
+            new DateFormatInfo("d MMM yy", MIDDAY, false),
+            new DateFormatInfo("yy/MM/dd", MIDDAY, false),
+            new DateFormatInfo("MM/dd/yy", MIDDAY, false)};
+
     private final XHTMLContentHandler handler;
     private final Metadata metadata;
     private final ParseContext parseContext;
@@ -155,21 +154,17 @@ class MailContentHandler implements ContentHandler {
         this.detector = detector;
     }
 
-    private static DateFormat createDateFormat(String format, TimeZone timezone) {
-        return createDateFormat(format, timezone, true);
-    }
-
-    private static DateFormat createDateFormat(String format, TimeZone timezone,
-                                               boolean isLenient) {
-        SimpleDateFormat sdf = new SimpleDateFormat(format, new DateFormatSymbols(Locale.US));
-        if (timezone != null) {
-            sdf.setTimeZone(timezone);
+    private static DateFormat createDateFormat(DateFormatInfo dateFormatInfo) {
+        SimpleDateFormat sdf = new SimpleDateFormat(dateFormatInfo.pattern,
+                new DateFormatSymbols(Locale.US));
+        if (dateFormatInfo.timeZone != null) {
+            sdf.setTimeZone(dateFormatInfo.timeZone);
         }
-        sdf.setLenient(isLenient);
+        sdf.setLenient(dateFormatInfo.lenient);
         return sdf;
     }
 
-    private static synchronized Date tryOtherDateFormats(String text) {
+    private static Date tryOtherDateFormats(String text) {
         if (text == null) {
             return null;
         }
@@ -187,8 +182,9 @@ class MailContentHandler implements ContentHandler {
             text = matcher.replaceFirst("$1 $2");
         }
 
-        for (DateFormat format : ALTERNATE_DATE_FORMATS) {
+        for (DateFormatInfo formatInfo : ALTERNATE_DATE_FORMATS) {
             try {
+                DateFormat format = createDateFormat(formatInfo);
                 return format.parse(text);
             } catch (ParseException e) {
                 //continue
@@ -653,4 +649,24 @@ class MailContentHandler implements ContentHandler {
             this.bytes = bytes;
         }
     }
+
+    private static class DateFormatInfo {
+        String pattern;
+        TimeZone timeZone;
+        boolean lenient;
+
+        public DateFormatInfo(String pattern) {
+            this(pattern, null, true);
+        }
+
+        public DateFormatInfo(String pattern, TimeZone timeZone) {
+            this(pattern, timeZone, true);
+        }
+
+        public DateFormatInfo(String pattern, TimeZone timeZone, boolean lenient) {
+            this.pattern = pattern;
+            this.timeZone = timeZone;
+            this.lenient = lenient;
+        }
+    }
 }