You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/06/12 15:13:37 UTC

[tika] branch main updated: TIKA-4043 -- fix build issues related to timezone differences and variations of output from Tesseract (#1187)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 32b8cbcf9 TIKA-4043 -- fix build issues related to timezone differences and variations of output from Tesseract (#1187)
32b8cbcf9 is described below

commit 32b8cbcf9e900690e67c832b635f36fc91d2330f
Author: Tim Allison <ta...@apache.org>
AuthorDate: Mon Jun 12 11:13:29 2023 -0400

    TIKA-4043 -- fix build issues related to timezone differences and variations of output from Tesseract (#1187)
---
 .../test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java | 4 +++-
 .../test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java  | 4 ++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
index 1fd53fe34..b00ec300d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
@@ -392,7 +392,9 @@ public class RTFParserTest extends TikaTest {
         assertEquals("1", xml.metadata.get(Office.PAGE_COUNT));
         assertEquals("70", xml.metadata.get(Office.WORD_COUNT));
         assertEquals("401", xml.metadata.get(Office.CHARACTER_COUNT));
-        assertTrue(xml.metadata.get(TikaCoreProperties.CREATED).startsWith("2010-10-13T"));
+        //RTFParser's legacy behavior is to apply the local timezone to dates/times.
+        //To pass in any timezone, check only the year-month prefix of the date (TIKA-4043)
+        assertTrue(xml.metadata.get(TikaCoreProperties.CREATED).startsWith("2010-10-"));
     }
 
     // TIKA-1192
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 776e3dddb..b2c1cb262 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -105,6 +105,8 @@ public class TesseractOCRParserTest extends TikaTest {
         //tesseract should handle multipage tiffs by itself
         //let's confirm that
         String xml = getXML("testTIFF_multipage.tif", getMetadata(MediaType.image("tiff"))).xml;
+        //TIKA-4043 -- some OS/versions of tesseract extract "Page?2"; normalize non-alphanumerics to spaces
+        xml = xml.replaceAll("[^A-Za-z0-9]", " ");
         assertContains("Page 2", xml);
     }
 
@@ -117,6 +119,8 @@ public class TesseractOCRParserTest extends TikaTest {
         context.set(TesseractOCRConfig.class, config);
         String xml =
                 getXML("testTIFF_multipage.tif", getMetadata(MediaType.image("tiff")), context).xml;
+        //TIKA-4043 -- some OS/versions of tesseract extract "Page?2"; normalize so that variant is also caught
+        xml = xml.replaceAll("[^A-Za-z0-9]", " ");
         assertNotContained("Page 2", xml);
     }