You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/09/11 21:41:06 UTC

[tika] branch main updated: TIKA-4126 -- fix bug in DateUtils that stripped timezone information from incoming Calendar objects (#1329)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 94c6d922a TIKA-4126 -- fix bug in DateUtils that stripped timezone information from incoming Calendar objects (#1329)
94c6d922a is described below

commit 94c6d922ad028074f5c1bb98d5cbacf46d4e21de
Author: Tim Allison <ta...@apache.org>
AuthorDate: Mon Sep 11 17:41:00 2023 -0400

    TIKA-4126 -- fix bug in DateUtils that stripped timezone information from incoming Calendar objects (#1329)
---
 CHANGES.txt                                        |  3 ++
 .../main/java/org/apache/tika/utils/DateUtils.java | 37 +++++++++++-----------
 .../apache/tika/parser/pdf/CustomTikaXMPTest.java  |  3 +-
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |  8 ++---
 .../tika/parser/xmp/JempboxExtractorTest.java      | 10 ++++++
 5 files changed, 38 insertions(+), 23 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index c2f5b298f..9d6a3b3ad 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 2.9.1 - ??
 
+   * Fix bug in DateUtils that stripped timezone information from
+     incoming Calendar objects (TIKA-4126).
+
    * The InputStreamDigester now calculates stream length (TIKA-4016).
 
 Release 2.9.0 - 8/23/2023
diff --git a/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java b/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java
index fb7883c74..a6a68fef6 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java
@@ -19,6 +19,7 @@ package org.apache.tika.utils;
 import java.text.DateFormat;
 import java.text.DateFormatSymbols;
 import java.text.SimpleDateFormat;
+import java.time.temporal.ChronoUnit;
 import java.util.ArrayList;
 import java.util.Calendar;
 import java.util.Date;
@@ -66,11 +67,11 @@ public class DateUtils {
     }
 
     /**
-     * Returns a ISO 8601 representation of the given date. This method
-     * is thread safe and non-blocking.
+     * Returns an ISO 8601 representation of the given date in UTC,
+     * truncated to the seconds unit. This method is thread safe and non-blocking.
      *
      * @param date given date
-     * @return ISO 8601 date string, including timezone details
+     * @return ISO 8601 date string in UTC, truncated to the seconds unit
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-495">TIKA-495</a>
      */
     public static String formatDate(Date date) {
@@ -80,27 +81,25 @@ public class DateUtils {
     }
 
     /**
-     * Returns a ISO 8601 representation of the given date. This method
-     * is thread safe and non-blocking.
+     * Returns an ISO 8601 representation of the given date in UTC,
+     * truncated to the seconds unit. This method is thread safe and non-blocking.
      *
-     * @param date given date
-     * @return ISO 8601 date string, including timezone details
+     * @param date given Calendar
+     * @return ISO 8601 date string in UTC, truncated to the seconds unit
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-495">TIKA-495</a>
      */
     public static String formatDate(Calendar date) {
-        // Explicitly switch it into UTC before formatting
-        date.setTimeZone(UTC);
         return doFormatDate(date);
     }
-
     /**
-     * Returns a ISO 8601 representation of the given date, which is
-     * in an unknown timezone. This method is thread safe and non-blocking.
+     * Returns an ISO 8601 representation of the given date in UTC,
+     * truncated to the seconds unit. This method is thread safe and non-blocking.
      *
      * @param date given date
-     * @return ISO 8601 date string, without timezone details
+     * @return ISO 8601 date string in UTC, truncated to the seconds unit
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-495">TIKA-495</a>
      */
+
     public static String formatDateUnknownTimezone(Date date) {
         // Create the Calendar object in the system timezone
         Calendar calendar = GregorianCalendar.getInstance(TimeZone.getDefault(), Locale.US);
@@ -111,12 +110,14 @@ public class DateUtils {
         return formatted.substring(0, formatted.length() - 1);
     }
 
+
+    /**
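+     * Returns the ISO-8601 formatted time converted to UTC, truncated to the seconds place.
+     * @param calendar the calendar whose instant is formatted
+     * @return the ISO-8601 string in UTC, truncated to the seconds unit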
+     */
     private static String doFormatDate(Calendar calendar) {
-        return String
-                .format(Locale.ROOT, "%04d-%02d-%02dT%02d:%02d:%02dZ", calendar.get(Calendar.YEAR),
-                        calendar.get(Calendar.MONTH) + 1, calendar.get(Calendar.DAY_OF_MONTH),
-                        calendar.get(Calendar.HOUR_OF_DAY), calendar.get(Calendar.MINUTE),
-                        calendar.get(Calendar.SECOND));
+        return calendar.toInstant().truncatedTo(ChronoUnit.SECONDS).toString();
     }
 
     private List<DateFormat> loadDateFormats() {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/CustomTikaXMPTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/CustomTikaXMPTest.java
index a99117065..f63d0a3bb 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/CustomTikaXMPTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/CustomTikaXMPTest.java
@@ -64,7 +64,8 @@ public class CustomTikaXMPTest extends TikaTest {
     public void testPDFVT() throws Exception {
         Metadata metadata = extract("testPDFVT.xmp");
         assertEquals("PDF/VT-1", metadata.get(PDF.PDFVT_VERSION));
-        assertEquals("2018-08-06T12:53:12Z", metadata.getDate(PDF.PDFVT_MODIFIED).toInstant().toString());
+        assertEquals("2018-08-06T11:53:12Z",
+                metadata.getDate(PDF.PDFVT_MODIFIED).toInstant().toString());
     }
 
     private Metadata extract(String xmpFileName) throws IOException, TikaException, SAXException {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index cb37992bc..7a059a22e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1023,9 +1023,9 @@ public class PDFParserTest extends TikaTest {
                         "Preflight", "Preflight"}, m.getValues(XMPMM.HISTORY_SOFTWARE_AGENT));
 
         assertArrayEquals(
-                new String[]{"2014-03-04T23:50:41Z", "2014-03-04T23:50:42Z", "2014-03-04T23:51:34Z",
-                        "2014-03-04T23:51:36Z", "2014-03-04T23:51:37Z", "2014-03-04T23:52:22Z",
-                        "2014-03-04T23:54:48Z"}, m.getValues(XMPMM.HISTORY_WHEN));
+                new String[]{"2014-03-04T22:50:41Z", "2014-03-04T22:50:42Z", "2014-03-04T22:51:34Z",
+                        "2014-03-04T22:51:36Z", "2014-03-04T22:51:37Z", "2014-03-04T22:52:22Z",
+                        "2014-03-04T22:54:48Z"}, m.getValues(XMPMM.HISTORY_WHEN));
     }
 
     @Test
@@ -1296,7 +1296,7 @@ public class PDFParserTest extends TikaTest {
         Metadata m = metadataList.get(0);
         //these two fields derive from the basic schema in the XMP, not dublin core
         assertEquals("Hewlett-Packard MFP", m.get(XMP.CREATOR_TOOL));
-        assertEquals("1998-08-29T13:53:15Z", m.get(XMP.CREATE_DATE));
+        assertEquals("1998-08-29T14:53:15Z", m.get(XMP.CREATE_DATE));
     }
 
     @Test
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java
index c131970b9..f43d83077 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java
@@ -134,4 +134,14 @@ public class JempboxExtractorTest extends TikaTest {
         }
     }
 
+    @Test
+    public void testModifiedTZ() throws Exception {
+        Metadata m = new Metadata();
+        JempboxExtractor ex = new JempboxExtractor(m);
+        try (InputStream is = getResourceAsStream("/test-documents/testXMP.xmp")) {
+            ex.parse(is);
+        }
+        assertEquals("2014-03-04T22:50:41Z", m.get(XMPMM.HISTORY_WHEN));
+    }
+
 }