You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/09/11 21:41:06 UTC
[tika] branch main updated: TIKA-4126 -- fix bug in DateUtils that stripped timezone information from incoming Calendar objects (#1329)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 94c6d922a TIKA-4126 -- fix bug in DateUtils that stripped timezone information from incoming Calendar objects (#1329)
94c6d922a is described below
commit 94c6d922ad028074f5c1bb98d5cbacf46d4e21de
Author: Tim Allison <ta...@apache.org>
AuthorDate: Mon Sep 11 17:41:00 2023 -0400
TIKA-4126 -- fix bug in DateUtils that stripped timezone information from incoming Calendar objects (#1329)
---
CHANGES.txt | 3 ++
.../main/java/org/apache/tika/utils/DateUtils.java | 37 +++++++++++-----------
.../apache/tika/parser/pdf/CustomTikaXMPTest.java | 3 +-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 8 ++---
.../tika/parser/xmp/JempboxExtractorTest.java | 10 ++++++
5 files changed, 38 insertions(+), 23 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index c2f5b298f..9d6a3b3ad 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
Release 2.9.1 - ??
+ * Fix bug in DateUtils that stripped timezone information from
+ incoming Calendar objects (TIKA-4126).
+
* The InputStreamDigester now calculates stream length (TIKA-4016).
Release 2.9.0 - 8/23/2023
diff --git a/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java b/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java
index fb7883c74..a6a68fef6 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java
@@ -19,6 +19,7 @@ package org.apache.tika.utils;
import java.text.DateFormat;
import java.text.DateFormatSymbols;
import java.text.SimpleDateFormat;
+import java.time.temporal.ChronoUnit;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
@@ -66,11 +67,11 @@ public class DateUtils {
}
/**
- * Returns a ISO 8601 representation of the given date. This method
- * is thread safe and non-blocking.
+ * Returns an ISO 8601 representation of the given date in UTC,
+ * truncated to the seconds unit. This method is thread safe and non-blocking.
*
* @param date given date
- * @return ISO 8601 date string, including timezone details
+ * @return ISO 8601 date string in UTC, truncated to the seconds unit
* @see <a href="https://issues.apache.org/jira/browse/TIKA-495">TIKA-495</a>
*/
public static String formatDate(Date date) {
@@ -80,27 +81,25 @@ public class DateUtils {
}
/**
- * Returns a ISO 8601 representation of the given date. This method
- * is thread safe and non-blocking.
+ * Returns an ISO 8601 representation of the given date in UTC,
+ * truncated to the seconds unit. This method is thread safe and non-blocking.
*
- * @param date given date
- * @return ISO 8601 date string, including timezone details
+ * @param date given Calendar
+ * @return ISO 8601 date string in UTC, truncated to the seconds unit
* @see <a href="https://issues.apache.org/jira/browse/TIKA-495">TIKA-495</a>
*/
public static String formatDate(Calendar date) {
- // Explicitly switch it into UTC before formatting
- date.setTimeZone(UTC);
return doFormatDate(date);
}
-
/**
- * Returns a ISO 8601 representation of the given date, which is
- * in an unknown timezone. This method is thread safe and non-blocking.
+ * Returns an ISO 8601 representation of the given date in UTC,
+ * truncated to the seconds unit. This method is thread safe and non-blocking.
*
* @param date given date
- * @return ISO 8601 date string, without timezone details
+ * @return ISO 8601 date string truncated to the seconds unit, with the trailing timezone designator removed
* @see <a href="https://issues.apache.org/jira/browse/TIKA-495">TIKA-495</a>
*/
+
public static String formatDateUnknownTimezone(Date date) {
// Create the Calendar object in the system timezone
Calendar calendar = GregorianCalendar.getInstance(TimeZone.getDefault(), Locale.US);
@@ -111,12 +110,14 @@ public class DateUtils {
return formatted.substring(0, formatted.length() - 1);
}
+
+ /**
+ * Returns ISO-8601 formatted time converted to UTC, truncated to the seconds place
+ * @param calendar calendar whose instant is formatted
+ * @return ISO 8601 date string in UTC, truncated to the seconds unit
+ */
private static String doFormatDate(Calendar calendar) {
- return String
- .format(Locale.ROOT, "%04d-%02d-%02dT%02d:%02d:%02dZ", calendar.get(Calendar.YEAR),
- calendar.get(Calendar.MONTH) + 1, calendar.get(Calendar.DAY_OF_MONTH),
- calendar.get(Calendar.HOUR_OF_DAY), calendar.get(Calendar.MINUTE),
- calendar.get(Calendar.SECOND));
+ return calendar.toInstant().truncatedTo(ChronoUnit.SECONDS).toString();
}
private List<DateFormat> loadDateFormats() {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/CustomTikaXMPTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/CustomTikaXMPTest.java
index a99117065..f63d0a3bb 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/CustomTikaXMPTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/CustomTikaXMPTest.java
@@ -64,7 +64,8 @@ public class CustomTikaXMPTest extends TikaTest {
public void testPDFVT() throws Exception {
Metadata metadata = extract("testPDFVT.xmp");
assertEquals("PDF/VT-1", metadata.get(PDF.PDFVT_VERSION));
- assertEquals("2018-08-06T12:53:12Z", metadata.getDate(PDF.PDFVT_MODIFIED).toInstant().toString());
+ assertEquals("2018-08-06T11:53:12Z",
+ metadata.getDate(PDF.PDFVT_MODIFIED).toInstant().toString());
}
private Metadata extract(String xmpFileName) throws IOException, TikaException, SAXException {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index cb37992bc..7a059a22e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1023,9 +1023,9 @@ public class PDFParserTest extends TikaTest {
"Preflight", "Preflight"}, m.getValues(XMPMM.HISTORY_SOFTWARE_AGENT));
assertArrayEquals(
- new String[]{"2014-03-04T23:50:41Z", "2014-03-04T23:50:42Z", "2014-03-04T23:51:34Z",
- "2014-03-04T23:51:36Z", "2014-03-04T23:51:37Z", "2014-03-04T23:52:22Z",
- "2014-03-04T23:54:48Z"}, m.getValues(XMPMM.HISTORY_WHEN));
+ new String[]{"2014-03-04T22:50:41Z", "2014-03-04T22:50:42Z", "2014-03-04T22:51:34Z",
+ "2014-03-04T22:51:36Z", "2014-03-04T22:51:37Z", "2014-03-04T22:52:22Z",
+ "2014-03-04T22:54:48Z"}, m.getValues(XMPMM.HISTORY_WHEN));
}
@Test
@@ -1296,7 +1296,7 @@ public class PDFParserTest extends TikaTest {
Metadata m = metadataList.get(0);
//these two fields derive from the basic schema in the XMP, not dublin core
assertEquals("Hewlett-Packard MFP", m.get(XMP.CREATOR_TOOL));
- assertEquals("1998-08-29T13:53:15Z", m.get(XMP.CREATE_DATE));
+ assertEquals("1998-08-29T14:53:15Z", m.get(XMP.CREATE_DATE));
}
@Test
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java
index c131970b9..f43d83077 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java
@@ -134,4 +134,14 @@ public class JempboxExtractorTest extends TikaTest {
}
}
+ @Test
+ public void testModifiedTZ() throws Exception {
+ Metadata m = new Metadata();
+ JempboxExtractor ex = new JempboxExtractor(m);
+ try (InputStream is = getResourceAsStream("/test-documents/testXMP.xmp")) {
+ ex.parse(is);
+ }
+ assertEquals("2014-03-04T22:50:41Z", m.get(XMPMM.HISTORY_WHEN));
+ }
+
}