You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/10/14 15:06:50 UTC
[tika] 02/02: TIKA-3570 -- add detection for esri layer files
(thanks to Tyler Thorsted for identifying example files).
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 1d934af1be57167528c363ef8cdf02476b6be5f1
Author: tballison <ta...@apache.org>
AuthorDate: Thu Oct 14 11:06:40 2021 -0400
TIKA-3570 -- add detection for esri layer files (thanks to Tyler Thorsted for identifying example files).
---
CHANGES.txt | 5 +++--
.../src/main/resources/org/apache/tika/mime/tika-mimetypes.xml | 5 +++++
.../apache/tika/detect/microsoft/POIFSContainerDetector.java | 6 ++++++
.../org/apache/tika/detect/TestContainerAwareDetector.java | 10 ++++++++++
4 files changed, 24 insertions(+), 2 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 936cf77..0de4ae0 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,7 +1,10 @@
Release 2.1.1 - ???
+ * Add detection of ESRI Layer files (TIKA-3570).
+
* Add detection of JPEG XL, MARC, ICC profiles, NES-ROM file types
(TIKA-3562 and TIKA-3563)
+
* Remove duplicate "subject" metadata keys that were intended
for backwards compatibility within 1.x only (TIKA-3564).
@@ -9,8 +12,6 @@ Release 2.1.1 - ???
and no longer require OPCPackageDetector-last ordering of zip
detectors (TIKA-3556).
- * Add detection of JPEG XL file types (TIKA-3562)
-
* Improve robustness and features of the httpfetcher (TIKA-3543)
* Add optional fetch ranges to FetchEmitTuple to allow range fetching from,
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index d85bfd6..fddf970 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3837,6 +3837,11 @@
</magic>
<glob pattern="*.zstd"/>
</mime-type>
+ <mime-type type="application/x-esri-layer">
+ <_comment>ESRI Layer file</_comment>
+ <sub-class-of type="application/x-tika-msoffice"/>
+ <glob pattern="*.lyr"/>
+ </mime-type>
<mime-type type="application/x-hdf">
<_comment>Hierarchical Data Format File</_comment>
<magic priority="50">
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
index e739668..50646e3 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
@@ -138,6 +138,8 @@ public class POIFSContainerDetector implements Detector {
*/
public static final MediaType SLDWORKS = application("sldworks");
+ public static final MediaType ESRI_LAYER = application("x-esri-layer");
+
/**
* Serial version UID
*/
@@ -297,6 +299,10 @@ public class POIFSContainerDetector implements Detector {
}
} else if (names.contains("Equation Native")) {
return MS_EQUATION;
+ } else if (names.contains("Layer")) {
+ //In one test file we also saw "LayerSmallImage" and "LayerLargeImage" entries;
+ //consider requiring those as well if matching on "Layer" alone yields false positives.
+ return ESRI_LAYER;
} else {
for (String name : names) {
if (name.startsWith("__substg1.0_")) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
index 7f4c4a6..ea5e841 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
@@ -32,6 +32,7 @@ import java.util.Random;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.apache.tika.MultiThreadedTikaTest;
@@ -471,6 +472,15 @@ public class TestContainerAwareDetector extends MultiThreadedTikaTest {
}
@Test
+ @Disabled("find acceptable test file")
+ public void testLyr() throws Exception {
+ //The file used in development was not added to the repo; it can be found at:
+ //https://cmgds.marine.usgs.gov/publications/of2005-1346/arcgis/bathy/Bathymetry.lyr
+ assertTypeByNameAndData("testLyr.lyr", "x-esri-layer",
+ "application/x-esri-layer", "application/x-tika-msoffice");
+ }
+
+ @Test
public void testCompressOOM() throws Exception {
assertTypeByData("testZ_oom.Z", "application/x-compress");
}