You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/10/14 15:06:48 UTC

[tika] branch main updated (415caf2 -> 1d934af)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from 415caf2  TIKA-3573 -- remove check
     new c8b19b0  alphabetize parameters
     new 1d934af  TIKA-3570 -- add detection for esri layer files (thanks to Tyler Thorsted for identifying example files).

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 CHANGES.txt                                        |  5 ++-
 .../org/apache/tika/mime/tika-mimetypes.xml        |  5 +++
 .../detect/microsoft/POIFSContainerDetector.java   |  6 +++
 .../apache/tika/parser/ocr/TesseractOCRConfig.java | 45 +++++++++++-----------
 .../tika/detect/TestContainerAwareDetector.java    | 10 +++++
 5 files changed, 47 insertions(+), 24 deletions(-)

[tika] 02/02: TIKA-3570 -- add detection for esri layer files (thanks to Tyler Thorsted for identifying example files).

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 1d934af1be57167528c363ef8cdf02476b6be5f1
Author: tballison <ta...@apache.org>
AuthorDate: Thu Oct 14 11:06:40 2021 -0400

    TIKA-3570 -- add detection for esri layer files (thanks to Tyler Thorsted for identifying example files).
---
 CHANGES.txt                                                    |  5 +++--
 .../src/main/resources/org/apache/tika/mime/tika-mimetypes.xml |  5 +++++
 .../apache/tika/detect/microsoft/POIFSContainerDetector.java   |  6 ++++++
 .../org/apache/tika/detect/TestContainerAwareDetector.java     | 10 ++++++++++
 4 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 936cf77..0de4ae0 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,7 +1,10 @@
 Release 2.1.1 - ???
 
+   * Add detection of ESRI Layer files (TIKA-3570).
+
    * Add detection of JPEG XL, MARC, ICC profiles, NES-ROM file types
      (TIKA-3562 and TIKA-3563)
+
    * Remove duplicate "subject" metadata keys that were intended
      for backwards compatibility within 1.x only (TIKA-3564).
 
@@ -9,8 +12,6 @@ Release 2.1.1 - ???
      and no longer require OPCPackageDetector-last ordering of zip
      detectors (TIKA-3556).
 
-   * Add detection of JPEG XL file types (TIKA-3562)
-
    * Improve robustness and features of the httpfetcher (TIKA-3543)
    
    * Add optional fetch ranges to FetchEmitTuple to allow range fetching from,
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index d85bfd6..fddf970 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3837,6 +3837,11 @@
     </magic>
     <glob pattern="*.zstd"/>
   </mime-type>
+  <mime-type type="application/x-esri-layer">
+    <_comment>ESRI Layer file</_comment>
+    <sub-class-of type="application/x-tika-msoffice"/>
+    <glob pattern="*.lyr"/>
+  </mime-type>
   <mime-type type="application/x-hdf">
     <_comment>Hierarchical Data Format File</_comment>
     <magic priority="50">
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
index e739668..50646e3 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
@@ -138,6 +138,8 @@ public class POIFSContainerDetector implements Detector {
      */
     public static final MediaType SLDWORKS = application("sldworks");
 
+    public static final MediaType ESRI_LAYER = application("x-esri-layer");
+
     /**
      * Serial version UID
      */
@@ -297,6 +299,10 @@ public class POIFSContainerDetector implements Detector {
             }
         } else if (names.contains("Equation Native")) {
             return MS_EQUATION;
+        } else if (names.contains("Layer")) {
+            //in one test file, also saw LayerSmallImage and LayerLargeImage
+            //maybe add those if we get false positives?
+            return ESRI_LAYER;
         } else {
             for (String name : names) {
                 if (name.startsWith("__substg1.0_")) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
index 7f4c4a6..ea5e841 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
@@ -32,6 +32,7 @@ import java.util.Random;
 
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
 import org.apache.tika.MultiThreadedTikaTest;
@@ -471,6 +472,15 @@ public class TestContainerAwareDetector extends MultiThreadedTikaTest {
     }
 
     @Test
+    @Disabled("find acceptable test file")
+    public void testLyr() throws Exception {
+        //file used in development but not added to
+        //repo: https://cmgds.marine.usgs.gov/publications/of2005-1346/arcgis/bathy/Bathymetry.lyr
+        assertTypeByNameAndData("testLyr.lyr", "x-esri-layer",
+                "application/x-esri-layer", "application/x-tika-msoffice");
+    }
+
+    @Test
     public void testCompressOOM() throws Exception {
         assertTypeByData("testZ_oom.Z", "application/x-compress");
     }

[tika] 01/02: alphabetize parameters

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit c8b19b0dd9178d193402dd46155f3052a9b71830
Author: tballison <ta...@apache.org>
AuthorDate: Thu Oct 14 10:49:15 2021 -0400

    alphabetize parameters
---
 .../apache/tika/parser/ocr/TesseractOCRConfig.java | 45 +++++++++++-----------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index 165569e..1835973 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -58,40 +58,41 @@ public class TesseractOCRConfig implements Serializable {
 
     private static Pattern ALLOWABLE_OTHER_PARAMS_PATTERN =
             Pattern.compile("(?i)^[-_/\\.A-Z0-9]+$");
-    // Language dictionary to be used.
-    private String language = "eng";
-    // Tesseract page segmentation mode.
-    private String pageSegMode = "1";
-    // Minimum file size to submit file to ocr.
-    private long minFileSizeToOcr = 0;
-    // Maximum file size to submit file to ocr.
-    private long maxFileSizeToOcr = Integer.MAX_VALUE;
-    // Maximum time (seconds) to wait for the ocring process termination
-    private int timeoutSeconds = 120;
-    // The format of the ocr'ed output to be returned, txt or hocr.
-    private OUTPUT_TYPE outputType = OUTPUT_TYPE.TXT;
-    // enable image preprocessing with ImageMagick (optional)
-    private boolean enableImagePreprocessing = false;
+
+    // whether or not to apply rotation calculated by the rotation.py script
+    private boolean applyRotation = false;
+    // colorspace of processed image.
+    private String colorspace = "gray";
     // resolution of processed image (in dpi).
     private int density = 300;
     // number of bits in a color sample within a pixel.
     private int depth = 4;
-    // colorspace of processed image.
-    private String colorspace = "gray";
+    // enable image preprocessing with ImageMagick (optional)
+    private boolean enableImagePreprocessing = false;
     // filter to be applied to the processed image.
     private String filter = "triangle";
-    // factor by which image is to be scaled.
-    // TODO: we should make this dynamic depending on the size of the image
-    // The current testRotation.png takes minutes to expand 900%
-    private int resize = 200;
+    // Language dictionary to be used.
+    private String language = "eng";
+    // Maximum file size to submit file to ocr.
+    private long maxFileSizeToOcr = Integer.MAX_VALUE;
+    // Minimum file size to submit file to ocr.
+    private long minFileSizeToOcr = 0;
+    // The format of the ocr'ed output to be returned, txt or hocr.
+    private OUTPUT_TYPE outputType = OUTPUT_TYPE.TXT;
+    // Tesseract page segmentation mode.
+    private String pageSegMode = "1";
     // See setPageSeparator.
     private String pageSeparator = "";
     // whether or not to preserve interword spacing
     private boolean preserveInterwordSpacing = false;
-    // whether or not to apply rotation calculated by the rotation.py script
-    private boolean applyRotation = false;
+    // factor by which image is to be scaled.
+    // TODO: we should make this dynamic depending on the size of the image
+    // The current testRotation.png takes minutes to expand 900%
+    private int resize = 200;
     // runtime switch to turn off OCR
     private boolean skipOcr = false;
+    // Maximum time (seconds) to wait for the ocring process termination
+    private int timeoutSeconds = 120;
     // See addOtherTesseractConfig.
     private Map<String, String> otherTesseractConfig = new HashMap<>();
     private Set<String> userConfigured = new HashSet<>();