You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/04/22 16:59:24 UTC

[tika] branch main updated: TIKA-3721 -- add detection of dgn files via Steven Frew's tika-dgn-detector

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new cee0f15df TIKA-3721 -- add detection of dgn files via Steven Frew's tika-dgn-detector
cee0f15df is described below

commit cee0f15df20e811416da7ed57c86c6d8aab89572
Author: tallison <ta...@apache.org>
AuthorDate: Fri Apr 22 12:59:07 2022 -0400

    TIKA-3721 -- add detection of dgn files via Steven Frew's tika-dgn-detector
---
 CHANGES.txt                                        |   3 +++
 .../org/apache/tika/mime/tika-mimetypes.xml        |  21 +++++++++++++++++++++
 .../src/test/resources/test-documents/testDGN7.dgn | Bin 0 -> 33792 bytes
 .../src/test/resources/test-documents/testDGN8.dgn | Bin 0 -> 28160 bytes
 .../detect/microsoft/POIFSContainerDetector.java   |   5 +++++
 .../tika/detect/TestContainerAwareDetector.java    |   1 +
 .../java/org/apache/tika/mime/TestMimeTypes.java   |   7 +++++++
 7 files changed, 37 insertions(+)

diff --git a/CHANGES.txt b/CHANGES.txt
index 2da66e57b..a58dfcd64 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -22,6 +22,9 @@ Release 2.4.0 - ???
 
    * Add detection for Frictionless Data packages and WACZ (TIKA-3696).
 
+   * Add detection for DGN files with gratitude and credit
+     to Steven Frew's tika-dgn-detector (TIKA-3721).
+
    * Add a fetcher and emitter for Azure blob storage (TIKA-3707).
 
    * Add detection for files encrypted by Microsoft's Rights Management Service
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 64f7f8481..a2b734cc8 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -5869,6 +5869,27 @@
   </mime-type>
 
   <mime-type type="image/vnd.cns.inf2"/>
+
+  <!-- credit: peeveen's tika-dgn-detector https://github.com/peeveen/tika-dgn-detector/ -->
+  <mime-type type="image/vnd.dgn">
+    <glob pattern="*.dgn" />
+    <glob pattern="*.dgnlib" />
+    <glob pattern="*.cel" />
+  </mime-type>
+  <mime-type type="image/vnd.dgn;version=7">
+    <_comment>MicroStation v7 drawing</_comment>
+    <_comment>Sometimes first byte is C8, sometimes it is 08.</_comment>
+    <magic priority="50">
+      <match value="0x0809FE02" type="string" offset="0" mask="0x0FFFFFFF" />
+    </magic>
+    <sub-class-of type="image/vnd.dgn"/>
+  </mime-type>
+  <mime-type type="image/vnd.dgn;version=8">
+    <_comment>MicroStation v8 drawing; requires ole2 detector</_comment>
+    <alias type="image/vnd.dgn;ver=8"/>
+    <sub-class-of type="application/x-tika-msoffice"/>
+  </mime-type>
+
   <mime-type type="image/vnd.djvu">
     <glob pattern="*.djvu"/>
     <glob pattern="*.djv"/>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/testDGN7.dgn b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/testDGN7.dgn
new file mode 100644
index 000000000..504202166
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/testDGN7.dgn differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/testDGN8.dgn b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/testDGN8.dgn
new file mode 100644
index 000000000..097f342c7
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/testDGN8.dgn differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
index 4117a03a3..d0571110c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
@@ -17,6 +17,7 @@
 package org.apache.tika.detect.microsoft;
 
 import static org.apache.tika.mime.MediaType.application;
+import static org.apache.tika.mime.MediaType.image;
 
 import java.io.IOException;
 import java.io.InputStream;
@@ -147,6 +148,7 @@ public class POIFSContainerDetector implements Detector {
 
     public static final MediaType ESRI_LAYER = application("x-esri-layer");
 
+    public static final MediaType DGN_8 = image("vnd.dgn;version=8");
     /**
      * Serial version UID
      */
@@ -326,6 +328,9 @@ public class POIFSContainerDetector implements Detector {
             //maybe add those if we get false positives?
             //in other test files there was a single entry for "Layer"
             return ESRI_LAYER;
+        } else if (names.contains("Dgn~Mf") && names.contains("Dgn~S") &&
+                names.contains("Dgn~H")) {
+            return DGN_8;
         } else {
             for (String name : names) {
                 if (name.startsWith("__substg1.0_")) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
index ea5e841a2..fc8c14371 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
@@ -163,6 +163,7 @@ public class TestContainerAwareDetector extends MultiThreadedTikaTest {
         // With a filename of a totally different type, data will trump filename
         assertTypeByNameAndData("testEXCEL.xls", "notPDF.pdf", "application/vnd.ms-excel");
         assertTypeByNameAndData("testEXCEL.xls", "notPNG.png", "application/vnd.ms-excel");
+        assertTypeByData("testDGN8.dgn", "image/vnd.dgn; version=8");
     }
 
     /**
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index c4922718d..9cccd08e8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -650,6 +650,13 @@ public class TestMimeTypes {
         // TODO Get a sample Binary DXF file and test
     }
 
+    @Test
+    public void testDGN() throws Exception {
+        assertTypeByName("image/vnd.dgn", "testDGN7.dgn");
+        assertTypeByName("image/vnd.dgn", "testDGN8.dgn");
+        assertTypeByData("image/vnd.dgn; version=7", "testDGN7.dgn");
+    }
+
     @Test
     public void testprtDetection() throws Exception {
         assertTypeByName("application/x-prt", "x.prt");