You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/05/26 17:56:53 UTC

[tika] 01/01: TIKA-4054 -- add a bunch of mimes via Greg Lepore

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4054
in repository https://gitbox.apache.org/repos/asf/tika.git

commit f6a7b8a7c954dc5a881b28ec32ddeaba3194a037
Author: tballison <ta...@apache.org>
AuthorDate: Fri May 26 13:56:44 2023 -0400

    TIKA-4054 -- add a bunch of mimes via Greg Lepore
---
 .../org/apache/tika/mime/tika-mimetypes.xml        | 116 ++++++++++++++++++++-
 .../java/org/apache/tika/mime/OneOffMimeTest.java  |  17 +++
 2 files changed, 128 insertions(+), 5 deletions(-)

diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 5b0a479fe..db90614b3 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3328,7 +3328,19 @@
     </magic>
     <glob pattern="*.arj"/>
   </mime-type>
-
+  <mime-type type="application/x-asprs">
+    <_comment>ASPRS Lidar Data Exchange Format</_comment>
+    <magic priority="50">
+      <match value="LASF" type="string" offset="0">
+        <!-- version 1.1: bytes 0x01 0x01 at offset 24 (presumably major, minor; confirm) -->
+        <match value="\x01\x01" type="string" offset="24"/>
+        <!-- version 1.2: bytes 0x01 0x02 at offset 24 -->
+        <match value="\x01\x02" type="string" offset="24"/>
+      </match> <!-- NOTE(review): only 1.1 and 1.2 are listed; other version bytes do not match -->
+    </magic>
+    <glob pattern="*.las"/>
+    <glob pattern="*.laz"/>
+  </mime-type>
   <mime-type type="application/x-authorware-bin">
     <glob pattern="*.aab"/>
     <glob pattern="*.x32"/>
@@ -3586,6 +3598,23 @@
     <sub-class-of type="application/x-tar"/>
   </mime-type>
 
+  <mime-type type="application/x-amiga-disk-format">
+    <_comment>Amiga Disk File</_comment>
+    <glob pattern="*.adf"/>
+    <magic priority="50">
+      <match value="DOS" offset="0" type="string"> <!-- 3-byte signature, then one flags byte -->
+        <match value="\x00" offset="3" type="string"/> <!-- flags byte directly follows "DOS" -->
+        <match value="\x01" offset="3" type="string"/>
+        <match value="\x02" offset="3" type="string"/>
+        <match value="\x03" offset="3" type="string"/>
+        <match value="\x04" offset="3" type="string"/>
+        <match value="\x05" offset="3" type="string"/>
+        <match value="\x06" offset="3" type="string"/>
+        <match value="\x07" offset="3" type="string"/>
+      </match>
+    </magic>
+  </mime-type>
+
   <mime-type type="application/x-brotli">
     <glob pattern="*.br" />
     <glob pattern="*.brotli" />
@@ -3852,10 +3881,12 @@
   <mime-type type="application/x-font-dos"/>
   <mime-type type="application/x-font-framemaker"/>
   <mime-type type="application/x-font-ghostscript">
+    <!-- *.gsf is also a Portable Sound Format extension; audio/x-psf leaves this glob out -->
     <glob pattern="*.gsf"/>
   </mime-type>
   <mime-type type="application/x-font-libgrx"/>
   <mime-type type="application/x-font-linux-psf">
+    <!-- *.psf is also a Portable Sound Format extension; audio/x-psf leaves this glob out -->
     <glob pattern="*.psf"/>
   </mime-type>
 
@@ -4086,7 +4117,16 @@
   <mime-type type="application/x-java-pack200">
     <glob pattern="*.pack"/>
   </mime-type>
-
+  <mime-type type="application/x-jeol-jdf">
+    <_comment>JDF NMR Spectroscopy</_comment>
+    <glob pattern="*.jdf"/>
+    <magic priority="50">
+      <!-- big endian: signature bytes in reading order -->
+      <match value="JEOL.NMR" offset="0" type="string"/>
+      <!-- little endian: the same eight bytes in reverse order -->
+      <match value="RMN.LOEJ" offset="0" type="string"/>
+    </magic>
+  </mime-type>
   <mime-type type="application/x-kdelnk">
     <magic priority="50">
       <match value="[KDE\ Desktop\ Entry]" type="string" offset="0"/>
@@ -4617,7 +4657,13 @@
       <match value="HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!" offset="0" />
     </magic>
   </mime-type>
-
+  <mime-type type="application/x-spss-sav">
+    <_comment>SPSS Data File</_comment>
+    <glob pattern="*.sav"/>
+    <magic priority="50">
+      <match value="$FL2@(#)" type="string" offset="0"/> <!-- explicit type, like sibling entries -->
+    </magic>
+  </mime-type>
   <mime-type type="application/x-sc">
     <magic priority="50">
       <match value="Spreadsheet" type="string" offset="38"/>
@@ -4659,7 +4705,13 @@
     </magic>
     <glob pattern="*.swf"/>
   </mime-type>
-
+  <mime-type type="application/x-sibelius">
+    <_comment>Sibelius</_comment>
+    <magic priority="50">
+      <match value="\x0FSIBELIUS" type="string" offset="0"/> <!-- byte 0x0F, then ASCII "SIBELIUS" -->
+    </magic>
+    <glob pattern="*.sib"/>
+  </mime-type>
   <mime-type type="application/x-silverlight-app">
     <glob pattern="*.xap"/>
   </mime-type>
@@ -4678,6 +4730,14 @@
     <glob pattern="*.sfdu"/>
   </mime-type>
 
+  <mime-type type="application/x-spectrum-tzx">
+    <_comment>TZX (ZX Spectrum)</_comment>
+    <magic priority="50">
+      <match value="ZXTape!\x1a" type="string" offset="0"/> <!-- ASCII "ZXTape!" then byte 0x1A -->
+    </magic>
+    <glob pattern="*.tzx"/>
+  </mime-type>
+
   <mime-type type="application/x-sqlite3">
     <magic priority="50">
       <match value="SQLite format 3\x00" type="string" offset="0"/>
@@ -5362,6 +5422,42 @@
     <glob pattern="*.ogg"/>
     <sub-class-of type="audio/ogg"/>
   </mime-type>
+  <mime-type type="audio/x-psf">
+    <_comment>Portable Sound Format</_comment>
+    <tika:link>http://web.archive.org/web/20140125155137/http://wiki.neillcorlett.com/PSFFormat</tika:link>
+    <magic priority="50">
+      <match value="PSF" type="string" offset="0"> <!-- byte at offset 3 picks the platform below -->
+        <!-- Playstation (PSF1) -->
+        <match value="\x01" type="string" offset="3"/>
+        <!-- Playstation 2 (PSF2) -->
+        <match value="\x02" type="string" offset="3"/>
+        <!-- Sega Saturn -->
+        <match value="\x11" type="string" offset="3"/>
+        <!-- Sega Dreamcast -->
+        <match value="\x12" type="string" offset="3"/>
+        <!-- Sega Genesis -->
+        <match value="\x13" type="string" offset="3"/>
+        <!-- Nintendo 64 -->
+        <match value="\x21" type="string" offset="3"/>
+        <!-- GameBoy Advance -->
+        <match value="\x22" type="string" offset="3"/>
+        <!-- Super NES -->
+        <match value="\x23" type="string" offset="3"/>
+        <!-- Capcom QSound -->
+        <match value="\x41" type="string" offset="3"/>
+      </match>
+    </magic>
+    <!-- disabled: *.psf is already claimed by application/x-font-linux-psf
+    <glob pattern="*.psf"/>-->
+    <glob pattern="*.psf1"/>
+    <glob pattern="*.psflib"/>
+    <glob pattern="*.minipsf"/>
+    <glob pattern="*.minipsf1"/>
+    <!-- disabled: *.gsf is already claimed by application/x-font-ghostscript
+    <glob pattern="*.gsf"/> -->
+    <glob pattern="*.gslib"/>
+    <glob pattern="*.minigsf"/>
+  </mime-type>
   <mime-type type="audio/x-sap">
     <_comment>Slight Atari Player</_comment>
     <tika:link>https://asap.sourceforge.net/sap-format.html</tika:link>
@@ -5670,7 +5766,17 @@
   <mime-type type="chemical/x-xyz">
     <glob pattern="*.xyz"/>
   </mime-type>
-
+  <mime-type type="image/x-3ds">
+    <_comment>3D Studio (V1)</_comment>
+    <magic priority="50">
+      <match value="MM" type="string" offset="0"> <!-- bytes 0x4D 0x4D -->
+        <match value="\x02\x00\x0A\x00\x00\x00" type="string" offset="6"> <!-- six fixed bytes -->
+          <match value="==" type="string" offset="16"/> <!-- bytes 0x3D 0x3D -->
+        </match>
+      </match>
+    </magic>
+    <glob pattern="*.3ds"/>
+  </mime-type>
   <mime-type type="image/aces">
     <_comment>ACES Image Container File</_comment>
     <magic priority="50">
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/OneOffMimeTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/OneOffMimeTest.java
index 45c491639..fb0ad4b54 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/OneOffMimeTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/OneOffMimeTest.java
@@ -18,6 +18,7 @@ package org.apache.tika.mime;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 
+import java.io.File;
 import java.io.InputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -28,6 +29,7 @@ import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
 import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
+import org.apache.tika.Tika;
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
@@ -46,6 +48,21 @@ public class OneOffMimeTest extends TikaTest {
         assertByName(mime, p);
     }
 
+    @Test
+    @Disabled("again for development purposes with files that aren't suitable for the repo")
+    public void testDir() throws Exception {
+        Path root = Paths.get(""); // placeholder: set to a local directory of sample files
+        Tika tika = new Tika();
+        File[] files = root.toFile().listFiles(File::isFile); // null if root is not a directory
+        for (File f : files == null ? new File[0] : files) { // subdirectories already filtered
+            String fileMime = tika.detect(f); // detect from the File
+            try (InputStream is = Files.newInputStream(f.toPath())) {
+                String streamMime = tika.detect(is); // detect from the raw stream
+                System.out.println(f.getName() + " fileMime=" + fileMime + " stream=" + streamMime);
+            }
+        }
+    }
+
     private void assertByName(String expected, Path p) throws Exception {
         Metadata metadata = new Metadata();
         metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, p.getFileName().toString());