You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/05/26 17:56:52 UTC

[tika] branch TIKA-4054 created (now f6a7b8a7c)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4054
in repository https://gitbox.apache.org/repos/asf/tika.git


      at f6a7b8a7c TIKA-4054 -- add a bunch of mimes via Greg Lepore

This branch includes the following new commits:

     new f6a7b8a7c TIKA-4054 -- add a bunch of mimes via Greg Lepore

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  Any revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[tika] 01/01: TIKA-4054 -- add a bunch of mimes via Greg Lepore

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4054
in repository https://gitbox.apache.org/repos/asf/tika.git

commit f6a7b8a7c954dc5a881b28ec32ddeaba3194a037
Author: tballison <ta...@apache.org>
AuthorDate: Fri May 26 13:56:44 2023 -0400

    TIKA-4054 -- add a bunch of mimes via Greg Lepore
---
 .../org/apache/tika/mime/tika-mimetypes.xml        | 116 ++++++++++++++++++++-
 .../java/org/apache/tika/mime/OneOffMimeTest.java  |  17 +++
 2 files changed, 128 insertions(+), 5 deletions(-)

diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 5b0a479fe..db90614b3 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3328,7 +3328,19 @@
     </magic>
     <glob pattern="*.arj"/>
   </mime-type>
-
+  <mime-type type="application/x-asprs">
+    <_comment>ASPRS Lidar Data Exchange Format</_comment>
+    <magic priority="50">
+      <match value="LASF" type="string" offset="0">
+        <!-- version 1.1; NOTE(review): only 1.1 and 1.2 are listed here, confirm that other versions are left out on purpose -->
+        <match value="\x01\x01" type="string" offset="24"/>
+        <!-- version 1.2 (same two-byte field at offset 24, second byte differs) -->
+        <match value="\x01\x02" type="string" offset="24"/>
+      </match>
+    </magic>
+    <glob pattern="*.las"/>
+    <glob pattern="*.laz"/>
+  </mime-type>
   <mime-type type="application/x-authorware-bin">
     <glob pattern="*.aab"/>
     <glob pattern="*.x32"/>
@@ -3586,6 +3598,23 @@
     <sub-class-of type="application/x-tar"/>
   </mime-type>
 
+  <mime-type type="application/x-amiga-disk-format">
+    <_comment>Amiga Disk File</_comment>
+    <glob pattern="*.adf"/>
+    <magic priority="50">
+      <match value="DOS" offset="0" type="string"> <!-- "DOS" fills bytes 0-2; each sub-match tests the byte right after it (offset 3) for one of the values 0 to 7 -->
+        <match value="\x00" offset="3" type="string"/>
+        <match value="\x01" offset="3" type="string"/>
+        <match value="\x02" offset="3" type="string"/>
+        <match value="\x03" offset="3" type="string"/>
+        <match value="\x04" offset="3" type="string"/>
+        <match value="\x05" offset="3" type="string"/>
+        <match value="\x06" offset="3" type="string"/>
+        <match value="\x07" offset="3" type="string"/>
+      </match>
+    </magic>
+  </mime-type>
+
   <mime-type type="application/x-brotli">
     <glob pattern="*.br" />
     <glob pattern="*.brotli" />
@@ -3852,10 +3881,12 @@
   <mime-type type="application/x-font-dos"/>
   <mime-type type="application/x-font-framemaker"/>
   <mime-type type="application/x-font-ghostscript">
+    <!-- *.gsf is also used by Portable Sound Format files (audio/x-psf); that entry leaves its *.gsf glob commented out -->
     <glob pattern="*.gsf"/>
   </mime-type>
   <mime-type type="application/x-font-libgrx"/>
   <mime-type type="application/x-font-linux-psf">
+    <!-- *.psf is also used by Portable Sound Format files (audio/x-psf); that entry leaves its *.psf glob commented out -->
     <glob pattern="*.psf"/>
   </mime-type>
 
@@ -4086,7 +4117,16 @@
   <mime-type type="application/x-java-pack200">
     <glob pattern="*.pack"/>
   </mime-type>
-
+  <mime-type type="application/x-jeol-jdf">
+    <_comment>JDF NMR Spectroscopy</_comment>
+    <glob pattern="*.jdf"/>
+    <magic priority="50">
+      <!-- big endian: the eight signature bytes in reading order -->
+      <match value="JEOL.NMR" offset="0" type="string"/>
+      <!-- little endian: the same eight bytes in reverse order -->
+      <match value="RMN.LOEJ" offset="0" type="string"/>
+    </magic>
+  </mime-type>
   <mime-type type="application/x-kdelnk">
     <magic priority="50">
       <match value="[KDE\ Desktop\ Entry]" type="string" offset="0"/>
@@ -4617,7 +4657,13 @@
       <match value="HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!" offset="0" />
     </magic>
   </mime-type>
-
+  <mime-type type="application/x-spss-sav">
+    <_comment>SPSS Data File</_comment>
+    <glob pattern="*.sav"/>
+    <magic priority="50">
+      <match value="$FL2@(#)" type="string" offset="0" /> <!-- type stated explicitly, like the other entries added in this change -->
+    </magic>
+  </mime-type>
   <mime-type type="application/x-sc">
     <magic priority="50">
       <match value="Spreadsheet" type="string" offset="38"/>
@@ -4659,7 +4705,13 @@
     </magic>
     <glob pattern="*.swf"/>
   </mime-type>
-
+  <mime-type type="application/x-sibelius">
+    <_comment>Sibelius</_comment>
+    <magic priority="50">
+      <match value="\x0FSIBELIUS" type="string" offset="0"/>
+    </magic>
+    <glob pattern="*.sib"/>
+  </mime-type>
   <mime-type type="application/x-silverlight-app">
     <glob pattern="*.xap"/>
   </mime-type>
@@ -4678,6 +4730,14 @@
     <glob pattern="*.sfdu"/>
   </mime-type>
 
+  <mime-type type="application/x-spectrum-tzx">
+    <_comment>TZX Tape Image (ZX Spectrum)</_comment>
+    <magic priority="50">
+      <match value="ZXTape!\x1a" type="string" offset="0"/> <!-- "ZXTape!" followed by byte 0x1A at the very start of the file -->
+    </magic>
+    <glob pattern="*.tzx"/>
+  </mime-type>
+
   <mime-type type="application/x-sqlite3">
     <magic priority="50">
       <match value="SQLite format 3\x00" type="string" offset="0"/>
@@ -5362,6 +5422,42 @@
     <glob pattern="*.ogg"/>
     <sub-class-of type="audio/ogg"/>
   </mime-type>
+  <mime-type type="audio/x-psf">
+    <_comment>Portable Sound Format</_comment>
+    <tika:link>http://web.archive.org/web/20140125155137/http://wiki.neillcorlett.com/PSFFormat</tika:link>
+    <magic priority="50">
+      <match value="PSF" type="string" offset="0"> <!-- "PSF" fills bytes 0-2; the byte at offset 3 selects the platform -->
+        <!-- Playstation (PSF1) -->
+        <match value="\x01" type="string" offset="3"/>
+        <!-- Playstation 2 (PSF2) -->
+        <match value="\x02" type="string" offset="3"/>
+        <!-- Sega Saturn -->
+        <match value="\x11" type="string" offset="3"/>
+        <!-- Sega Dreamcast -->
+        <match value="\x12" type="string" offset="3"/>
+        <!-- Sega Genesis -->
+        <match value="\x13" type="string" offset="3"/>
+        <!-- Nintendo 64 -->
+        <match value="\x21" type="string" offset="3"/>
+        <!-- GameBoy Advance -->
+        <match value="\x22" type="string" offset="3"/>
+        <!-- Super NES -->
+        <match value="\x23" type="string" offset="3"/>
+        <!-- Capcom QSound -->
+        <match value="\x41" type="string" offset="3"/>
+      </match>
+    </magic>
+    <!-- conflict with application/x-font-linux-psf
+    <glob pattern="*.psf"/>-->
+    <glob pattern="*.psf1"/>
+    <glob pattern="*.psflib"/>
+    <glob pattern="*.minipsf"/>
+    <glob pattern="*.minipsf1"/>
+    <!-- conflict with application/x-font-ghostscript
+    <glob pattern="*.gsf"/> -->
+    <glob pattern="*.gsflib"/> <!-- GSF counterpart of *.psflib; NOTE(review): confirm the extension against sample files -->
+    <glob pattern="*.minigsf"/>
+  </mime-type>
   <mime-type type="audio/x-sap">
     <_comment>Slight Atari Player</_comment>
     <tika:link>https://asap.sourceforge.net/sap-format.html</tika:link>
@@ -5670,7 +5766,17 @@
   <mime-type type="chemical/x-xyz">
     <glob pattern="*.xyz"/>
   </mime-type>
-
+  <mime-type type="image/x-3ds">
+    <_comment>3D Studio (V1)</_comment>
+    <magic priority="50">
+      <match value="MM" type="string" offset="0"> <!-- NOTE(review): presumably the 3DS main chunk id 0x4D4D stored little-endian; confirm against the format description -->
+        <match value="\x02\x00\x0A\x00\x00\x00" type="string" offset="6"> <!-- six fixed bytes at offsets 6-11; presumably a version chunk id (0x0002) plus its length (10), TODO confirm -->
+          <match value="==" type="string" offset="16"/> <!-- bytes 0x3D 0x3D at offset 16; presumably the id of the chunk that follows, TODO confirm -->
+        </match>
+      </match>
+    </magic>
+    <glob pattern="*.3ds"/>
+  </mime-type>
   <mime-type type="image/aces">
     <_comment>ACES Image Container File</_comment>
     <magic priority="50">
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/OneOffMimeTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/OneOffMimeTest.java
index 45c491639..fb0ad4b54 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/OneOffMimeTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/OneOffMimeTest.java
@@ -18,6 +18,7 @@ package org.apache.tika.mime;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 
+import java.io.File;
 import java.io.InputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -28,6 +29,7 @@ import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
 import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
+import org.apache.tika.Tika;
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
@@ -46,6 +48,21 @@ public class OneOffMimeTest extends TikaTest {
         assertByName(mime, p);
     }
 
+    @Test
+    @Disabled("again for development purposes with files that aren't suitable for the repo")
+    public void testDir() throws Exception {
+        Path root = Paths.get("");
+        Tika tika = new Tika();
+        for (File f : root.toFile().listFiles()) {
+            String fileMime = tika.detect(f);
+            String streamMime = "";
+            try (InputStream is = Files.newInputStream(f.toPath())) {
+                streamMime = tika.detect(is);
+            }
+            System.out.println(f.getName() + " fileMime=" + fileMime + " stream=" + streamMime);
+        }
+    }
+
     private void assertByName(String expected, Path p) throws Exception {
         Metadata metadata = new Metadata();
         metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, p.getFileName().toString());