You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/06/12 14:17:03 UTC

[tika] branch TIKA-4043 created (now 99f8aa4cd)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4043
in repository https://gitbox.apache.org/repos/asf/tika.git


      at 99f8aa4cd TIKA-4043 -- fix build issues related to timezone differences and variations of output from Tesseract

This branch includes the following new commits:

     new 99f8aa4cd TIKA-4043 -- fix build issues related to timezone differences and variations of output from Tesseract

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[tika] 01/01: TIKA-4043 -- fix build issues related to timezone differences and variations of output from Tesseract

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4043
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 99f8aa4cd7cbf6a7ab565473a3a27285a2dd9907
Author: tballison <ta...@apache.org>
AuthorDate: Mon Jun 12 10:16:52 2023 -0400

    TIKA-4043 -- fix build issues related to timezone differences and variations of output from Tesseract
---
 .../test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java | 4 +++-
 .../test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java  | 4 ++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
index 1fd53fe34..b00ec300d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
@@ -392,7 +392,9 @@ public class RTFParserTest extends TikaTest {
         assertEquals("1", xml.metadata.get(Office.PAGE_COUNT));
         assertEquals("70", xml.metadata.get(Office.WORD_COUNT));
         assertEquals("401", xml.metadata.get(Office.CHARACTER_COUNT));
-        assertTrue(xml.metadata.get(TikaCoreProperties.CREATED).startsWith("2010-10-13T"));
+        //RTFParser's legacy behavior is to apply local timezone to dates/times.
+        //This needs to be flexible enough to pass in various time-zones TIKA-4043
+        assertTrue(xml.metadata.get(TikaCoreProperties.CREATED).startsWith("2010-10-"));
     }
 
     // TIKA-1192
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 776e3dddb..b2c1cb262 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -105,6 +105,8 @@ public class TesseractOCRParserTest extends TikaTest {
         //tesseract should handle multipage tiffs by itself
         //let's confirm that
         String xml = getXML("testTIFF_multipage.tif", getMetadata(MediaType.image("tiff"))).xml;
+        //TIKA-4043 -- on some OS/versions of tesseract Page?2 is extracted
+        xml = xml.replaceAll("[^A-Za-z0-9]", " ");
         assertContains("Page 2", xml);
     }
 
@@ -117,6 +119,8 @@ public class TesseractOCRParserTest extends TikaTest {
         context.set(TesseractOCRConfig.class, config);
         String xml =
                 getXML("testTIFF_multipage.tif", getMetadata(MediaType.image("tiff")), context).xml;
+        //TIKA-4043 -- on some OS/versions of tesseract Page?2 is extracted
+        xml = xml.replaceAll("[^A-Za-z0-9]", " ");
         assertNotContained("Page 2", xml);
     }