You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/10/14 15:06:50 UTC

[tika] 02/02: TIKA-3570 -- add detection for esri layer files (thanks to Tyler Thorsted for identifying example files).

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 1d934af1be57167528c363ef8cdf02476b6be5f1
Author: tballison <ta...@apache.org>
AuthorDate: Thu Oct 14 11:06:40 2021 -0400

    TIKA-3570 -- add detection for esri layer files (thanks to Tyler Thorsted for identifying example files).
---
 CHANGES.txt                                                    |  5 +++--
 .../src/main/resources/org/apache/tika/mime/tika-mimetypes.xml |  5 +++++
 .../apache/tika/detect/microsoft/POIFSContainerDetector.java   |  6 ++++++
 .../org/apache/tika/detect/TestContainerAwareDetector.java     | 10 ++++++++++
 4 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 936cf77..0de4ae0 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,7 +1,10 @@
 Release 2.1.1 - ???
 
+   * Add detection of ESRI Layer files (TIKA-3570).
+
    * Add detection of JPEG XL, MARC, ICC profiles, NES-ROM file types
      (TIKA-3562 and TIKA-3563)
+
    * Remove duplicate "subject" metadata keys that were intended
      for backwards compatibility within 1.x only (TIKA-3564).
 
@@ -9,8 +12,6 @@ Release 2.1.1 - ???
      and no longer require OPCPackageDetector-last ordering of zip
      detectors (TIKA-3556).
 
-   * Add detection of JPEG XL file types (TIKA-3562)
-
    * Improve robustness and features of the httpfetcher (TIKA-3543)
    
    * Add optional fetch ranges to FetchEmitTuple to allow range fetching from,
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index d85bfd6..fddf970 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3837,6 +3837,11 @@
     </magic>
     <glob pattern="*.zstd"/>
   </mime-type>
+  <mime-type type="application/x-esri-layer">
+    <_comment>ESRI Layer file</_comment>
+    <sub-class-of type="application/x-tika-msoffice"/>
+    <glob pattern="*.lyr"/>
+  </mime-type>
   <mime-type type="application/x-hdf">
     <_comment>Hierarchical Data Format File</_comment>
     <magic priority="50">
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
index e739668..50646e3 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
@@ -138,6 +138,8 @@ public class POIFSContainerDetector implements Detector {
      */
     public static final MediaType SLDWORKS = application("sldworks");
 
+    public static final MediaType ESRI_LAYER = application("x-esri-layer");
+
     /**
      * Serial version UID
      */
@@ -297,6 +299,10 @@ public class POIFSContainerDetector implements Detector {
             }
         } else if (names.contains("Equation Native")) {
             return MS_EQUATION;
+        } else if (names.contains("Layer")) {
+            //in one test file, also saw LayerSmallImage and LayerLargeImage
+            //maybe add those if we get false positives?
+            return ESRI_LAYER;
         } else {
             for (String name : names) {
                 if (name.startsWith("__substg1.0_")) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
index 7f4c4a6..ea5e841 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
@@ -32,6 +32,7 @@ import java.util.Random;
 
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
 import org.apache.tika.MultiThreadedTikaTest;
@@ -471,6 +472,15 @@ public class TestContainerAwareDetector extends MultiThreadedTikaTest {
     }
 
     @Test
+    @Disabled("find acceptable test file")
+    public void testLyr() throws Exception {
+        //file used in development but not added to
+        //repo: https://cmgds.marine.usgs.gov/publications/of2005-1346/arcgis/bathy/Bathymetry.lyr
+        assertTypeByNameAndData("testLyr.lyr", "x-esri-layer",
+                "application/x-esri-layer", "application/x-tika-msoffice");
+    }
+
+    @Test
     public void testCompressOOM() throws Exception {
         assertTypeByData("testZ_oom.Z", "application/x-compress");
     }