You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/09/11 16:43:54 UTC

[tika] branch TIKA-4126 created (now c6a7e542d)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4126
in repository https://gitbox.apache.org/repos/asf/tika.git


      at c6a7e542d TIKA-4126 -- fix bug in DateUtils that stripped timezone information from incoming Calendar objects

This branch includes the following new commits:

     new c6a7e542d TIKA-4126 -- fix bug in DateUtils that stripped timezone information from incoming Calendar objects

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[tika] 01/01: TIKA-4126 -- fix bug in DateUtils that stripped timezone information from incoming Calendar objects

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4126
in repository https://gitbox.apache.org/repos/asf/tika.git

commit c6a7e542db5b2ac3cd7713cb6b519330b13f1b5f
Author: tballison <ta...@apache.org>
AuthorDate: Mon Sep 11 12:43:47 2023 -0400

    TIKA-4126 -- fix bug in DateUtils that stripped timezone information from incoming Calendar objects
---
 CHANGES.txt                                        |  3 ++
 .../main/java/org/apache/tika/utils/DateUtils.java | 37 +++++++++++-----------
 .../apache/tika/parser/pdf/CustomTikaXMPTest.java  |  3 +-
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |  8 ++---
 .../tika/parser/xmp/JempboxExtractorTest.java      | 10 ++++++
 5 files changed, 38 insertions(+), 23 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index c2f5b298f..9d6a3b3ad 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 2.9.1 - ??
 
+   * Fix bug in DateUtils that stripped timezone information from
+     incoming Calendar objects (TIKA-4126).
+
    * The InputStreamDigester now calculates stream length (TIKA-4016).
 
 Release 2.9.0 - 8/23/2023
diff --git a/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java b/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java
index fb7883c74..a6a68fef6 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java
@@ -19,6 +19,7 @@ package org.apache.tika.utils;
 import java.text.DateFormat;
 import java.text.DateFormatSymbols;
 import java.text.SimpleDateFormat;
+import java.time.temporal.ChronoUnit;
 import java.util.ArrayList;
 import java.util.Calendar;
 import java.util.Date;
@@ -66,11 +67,11 @@ public class DateUtils {
     }
 
     /**
-     * Returns a ISO 8601 representation of the given date. This method
-     * is thread safe and non-blocking.
+     * Returns a ISO 8601 representation of the given date in UTC,
+     * truncated to the seconds unit. This method is thread safe and non-blocking.
      *
      * @param date given date
-     * @return ISO 8601 date string, including timezone details
+     * @return ISO 8601 date string in UTC, truncated to the seconds unit
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-495">TIKA-495</a>
      */
     public static String formatDate(Date date) {
@@ -80,27 +81,25 @@ public class DateUtils {
     }
 
     /**
-     * Returns a ISO 8601 representation of the given date. This method
-     * is thread safe and non-blocking.
+     * Returns a ISO 8601 representation of the given date in UTC,
+     * truncated to the seconds unit. This method is thread safe and non-blocking.
      *
-     * @param date given date
-     * @return ISO 8601 date string, including timezone details
+     * @param date given Calendar
+     * @return ISO 8601 date string in UTC, truncated to the seconds unit
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-495">TIKA-495</a>
      */
     public static String formatDate(Calendar date) {
-        // Explicitly switch it into UTC before formatting
-        date.setTimeZone(UTC);
         return doFormatDate(date);
     }
-
     /**
-     * Returns a ISO 8601 representation of the given date, which is
-     * in an unknown timezone. This method is thread safe and non-blocking.
+     * Returns a ISO 8601 representation of the given date in UTC,
+     * truncated to the seconds unit. This method is thread safe and non-blocking.
      *
      * @param date given date
-     * @return ISO 8601 date string, without timezone details
+     * @return ISO 8601 date string in UTC, truncated to the seconds unit, without the trailing Z
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-495">TIKA-495</a>
      */
+
     public static String formatDateUnknownTimezone(Date date) {
         // Create the Calendar object in the system timezone
         Calendar calendar = GregorianCalendar.getInstance(TimeZone.getDefault(), Locale.US);
@@ -111,12 +110,14 @@ public class DateUtils {
         return formatted.substring(0, formatted.length() - 1);
     }
 
+
+    /**
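+     * Returns ISO-8601 formatted time converted to UTC, truncated to the seconds place.
+     * @param calendar the calendar whose instant is formatted
+     * @return the ISO-8601 string of the calendar's instant in UTC, truncated to seconds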
+     */
     private static String doFormatDate(Calendar calendar) {
-        return String
-                .format(Locale.ROOT, "%04d-%02d-%02dT%02d:%02d:%02dZ", calendar.get(Calendar.YEAR),
-                        calendar.get(Calendar.MONTH) + 1, calendar.get(Calendar.DAY_OF_MONTH),
-                        calendar.get(Calendar.HOUR_OF_DAY), calendar.get(Calendar.MINUTE),
-                        calendar.get(Calendar.SECOND));
+        return calendar.toInstant().truncatedTo(ChronoUnit.SECONDS).toString();
     }
 
     private List<DateFormat> loadDateFormats() {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/CustomTikaXMPTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/CustomTikaXMPTest.java
index a99117065..f63d0a3bb 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/CustomTikaXMPTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/CustomTikaXMPTest.java
@@ -64,7 +64,8 @@ public class CustomTikaXMPTest extends TikaTest {
     public void testPDFVT() throws Exception {
         Metadata metadata = extract("testPDFVT.xmp");
         assertEquals("PDF/VT-1", metadata.get(PDF.PDFVT_VERSION));
-        assertEquals("2018-08-06T12:53:12Z", metadata.getDate(PDF.PDFVT_MODIFIED).toInstant().toString());
+        assertEquals("2018-08-06T11:53:12Z",
+                metadata.getDate(PDF.PDFVT_MODIFIED).toInstant().toString());
     }
 
     private Metadata extract(String xmpFileName) throws IOException, TikaException, SAXException {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index cb37992bc..7a059a22e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1023,9 +1023,9 @@ public class PDFParserTest extends TikaTest {
                         "Preflight", "Preflight"}, m.getValues(XMPMM.HISTORY_SOFTWARE_AGENT));
 
         assertArrayEquals(
-                new String[]{"2014-03-04T23:50:41Z", "2014-03-04T23:50:42Z", "2014-03-04T23:51:34Z",
-                        "2014-03-04T23:51:36Z", "2014-03-04T23:51:37Z", "2014-03-04T23:52:22Z",
-                        "2014-03-04T23:54:48Z"}, m.getValues(XMPMM.HISTORY_WHEN));
+                new String[]{"2014-03-04T22:50:41Z", "2014-03-04T22:50:42Z", "2014-03-04T22:51:34Z",
+                        "2014-03-04T22:51:36Z", "2014-03-04T22:51:37Z", "2014-03-04T22:52:22Z",
+                        "2014-03-04T22:54:48Z"}, m.getValues(XMPMM.HISTORY_WHEN));
     }
 
     @Test
@@ -1296,7 +1296,7 @@ public class PDFParserTest extends TikaTest {
         Metadata m = metadataList.get(0);
         //these two fields derive from the basic schema in the XMP, not dublin core
         assertEquals("Hewlett-Packard MFP", m.get(XMP.CREATOR_TOOL));
-        assertEquals("1998-08-29T13:53:15Z", m.get(XMP.CREATE_DATE));
+        assertEquals("1998-08-29T14:53:15Z", m.get(XMP.CREATE_DATE));
     }
 
     @Test
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java
index c131970b9..f43d83077 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java
@@ -134,4 +134,14 @@ public class JempboxExtractorTest extends TikaTest {
         }
     }
 
+    @Test
+    public void testModifiedTZ() throws Exception {
+        Metadata m = new Metadata();
+        JempboxExtractor ex = new JempboxExtractor(m);
+        try (InputStream is = getResourceAsStream("/test-documents/testXMP.xmp")) {
+            ex.parse(is);
+        }
+        assertEquals("2014-03-04T22:50:41Z", m.get(XMPMM.HISTORY_WHEN));
+    }
+
 }