You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/05/26 17:56:53 UTC
[tika] 01/01: TIKA-4054 -- add a bunch of mimes via Greg Lepore
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-4054
in repository https://gitbox.apache.org/repos/asf/tika.git
commit f6a7b8a7c954dc5a881b28ec32ddeaba3194a037
Author: tballison <ta...@apache.org>
AuthorDate: Fri May 26 13:56:44 2023 -0400
TIKA-4054 -- add a bunch of mimes via Greg Lepore
---
.../org/apache/tika/mime/tika-mimetypes.xml | 116 ++++++++++++++++++++-
.../java/org/apache/tika/mime/OneOffMimeTest.java | 17 +++
2 files changed, 128 insertions(+), 5 deletions(-)
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 5b0a479fe..db90614b3 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3328,7 +3328,19 @@
</magic>
<glob pattern="*.arj"/>
</mime-type>
-
+ <mime-type type="application/x-asprs">
+ <_comment>ASPRS Lidar Data Exchange Format</_comment>
+ <magic priority="50">
+ <!-- "LASF" file signature at offset 0; the version major/minor
+ bytes follow at offsets 24 and 25 of the public header block -->
+ <match value="LASF" type="string" offset="0">
+ <!-- version 1.0 -->
+ <match value="\x01\x00" type="string" offset="24"/>
+ <!-- version 1.1 -->
+ <match value="\x01\x01" type="string" offset="24"/>
+ <!-- version 1.2 -->
+ <match value="\x01\x02" type="string" offset="24"/>
+ <!-- version 1.3 -->
+ <match value="\x01\x03" type="string" offset="24"/>
+ <!-- version 1.4 -->
+ <match value="\x01\x04" type="string" offset="24"/>
+ </match>
+ </magic>
+ <glob pattern="*.las"/>
+ <glob pattern="*.laz"/>
+ </mime-type>
<mime-type type="application/x-authorware-bin">
<glob pattern="*.aab"/>
<glob pattern="*.x32"/>
@@ -3586,6 +3598,23 @@
<sub-class-of type="application/x-tar"/>
</mime-type>
+ <mime-type type="application/x-amiga-disk-format">
+ <_comment>Amiga Disk File</_comment>
+ <glob pattern="*.adf"/>
+ <magic priority="50">
+ <!-- boot block starts with "DOS" (bytes 0-2) followed by a single
+ flags byte at offset 3 with a value of 0 through 7 -->
+ <match value="DOS" offset="0" type="string">
+ <match value="\x00" offset="3" type="string"/>
+ <match value="\x01" offset="3" type="string"/>
+ <match value="\x02" offset="3" type="string"/>
+ <match value="\x03" offset="3" type="string"/>
+ <match value="\x04" offset="3" type="string"/>
+ <match value="\x05" offset="3" type="string"/>
+ <match value="\x06" offset="3" type="string"/>
+ <match value="\x07" offset="3" type="string"/>
+ </match>
+ </magic>
+ </mime-type>
+
<mime-type type="application/x-brotli">
<glob pattern="*.br" />
<glob pattern="*.brotli" />
@@ -3852,10 +3881,12 @@
<mime-type type="application/x-font-dos"/>
<mime-type type="application/x-font-framemaker"/>
<mime-type type="application/x-font-ghostscript">
+ <!-- conflict with portable sound format -->
<glob pattern="*.gsf"/>
</mime-type>
<mime-type type="application/x-font-libgrx"/>
<mime-type type="application/x-font-linux-psf">
+ <!-- conflict with portable sound format -->
<glob pattern="*.psf"/>
</mime-type>
@@ -4086,7 +4117,16 @@
<mime-type type="application/x-java-pack200">
<glob pattern="*.pack"/>
</mime-type>
-
+ <mime-type type="application/x-jeol-jdf">
+ <_comment>JDF NMR Spectroscopy</_comment>
+ <glob pattern="*.jdf"/>
+ <magic priority="50">
+ <!-- 8-byte signature at offset 0; either byte order is accepted.
+ The little endian value is the big endian one reversed. -->
+ <!-- big endian -->
+ <match value="JEOL.NMR" offset="0" type="string"/>
+ <!-- little endian -->
+ <match value="RMN.LOEJ" offset="0" type="string"/>
+ </magic>
+ </mime-type>
<mime-type type="application/x-kdelnk">
<magic priority="50">
<match value="[KDE\ Desktop\ Entry]" type="string" offset="0"/>
@@ -4617,7 +4657,13 @@
<match value="HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!" offset="0" />
</magic>
</mime-type>
-
+ <mime-type type="application/x-spss-sav">
+ <_comment>SPSS Data File</_comment>
+ <glob pattern="*.sav"/>
+ <magic priority="50">
+ <!-- 8-byte ASCII signature at the very start of the file.
+ NOTE(review): no type attribute is given here; presumably it
+ defaults to a string match as in other entries - confirm. -->
+ <match value="$FL2@(#)" offset="0" />
+ </magic>
+ </mime-type>
<mime-type type="application/x-sc">
<magic priority="50">
<match value="Spreadsheet" type="string" offset="38"/>
@@ -4659,7 +4705,13 @@
</magic>
<glob pattern="*.swf"/>
</mime-type>
-
+ <mime-type type="application/x-sibelius">
+ <_comment>Sibelius</_comment>
+ <magic priority="50">
+ <!-- a single 0x0F byte followed by the ASCII text "SIBELIUS" at offset 0 -->
+ <match value="\x0FSIBELIUS" type="string" offset="0"/>
+ </magic>
+ <glob pattern="*.sib"/>
+ </mime-type>
<mime-type type="application/x-silverlight-app">
<glob pattern="*.xap"/>
</mime-type>
@@ -4678,6 +4730,14 @@
<glob pattern="*.sfdu"/>
</mime-type>
+ <mime-type type="application/x-spectrum-tzx">
+ <_comment>TZX (ZX Spectrum)</_comment>
+ <magic priority="50">
+ <!-- TZX header: ASCII "ZXTape!" followed by the 0x1A end-of-text marker -->
+ <match value="ZXTape!\x1a" type="string" offset="0"/>
+ </magic>
+ <glob pattern="*.tzx"/>
+ </mime-type>
+
<mime-type type="application/x-sqlite3">
<magic priority="50">
<match value="SQLite format 3\x00" type="string" offset="0"/>
@@ -5362,6 +5422,42 @@
<glob pattern="*.ogg"/>
<sub-class-of type="audio/ogg"/>
</mime-type>
+ <mime-type type="audio/x-psf">
+ <_comment>Portable Sound Format</_comment>
+ <tika:link>http://web.archive.org/web/20140125155137/http://wiki.neillcorlett.com/PSFFormat</tika:link>
+ <magic priority="50">
+ <!-- "PSF" at offset 0, then one version byte at offset 3 that
+ identifies the target platform (one entry per platform below) -->
+ <match value="PSF" type="string" offset="0">
+ <!-- Playstation (PSF1) -->
+ <match value="\x01" type="string" offset="3"/>
+ <!-- Playstation 2 (PSF2) -->
+ <match value="\x02" type="string" offset="3"/>
+ <!-- Sega Saturn -->
+ <match value="\x11" type="string" offset="3"/>
+ <!-- Sega Dreamcast -->
+ <match value="\x12" type="string" offset="3"/>
+ <!-- Sega Genesis -->
+ <match value="\x13" type="string" offset="3"/>
+ <!-- Nintendo 64 -->
+ <match value="\x21" type="string" offset="3"/>
+ <!-- GameBoy Advance -->
+ <match value="\x22" type="string" offset="3"/>
+ <!-- Super NES -->
+ <match value="\x23" type="string" offset="3"/>
+ <!-- Capcom QSound -->
+ <match value="\x41" type="string" offset="3"/>
+ </match>
+ </magic>
+ <!-- conflict with application/x-font-linux-psf
+ <glob pattern="*.psf"/>-->
+ <glob pattern="*.psf1"/>
+ <glob pattern="*.psflib"/>
+ <glob pattern="*.minipsf"/>
+ <glob pattern="*.minipsf1"/>
+ <!-- conflict with application/x-font-ghostscript
+ <glob pattern="*.gsf"/> -->
+ <glob pattern="*.gslib"/>
+ <glob pattern="*.minigsf"/>
+ </mime-type>
<mime-type type="audio/x-sap">
<_comment>Slight Atari Player</_comment>
<tika:link>https://asap.sourceforge.net/sap-format.html</tika:link>
@@ -5670,7 +5766,17 @@
<mime-type type="chemical/x-xyz">
<glob pattern="*.xyz"/>
</mime-type>
-
+ <mime-type type="image/x-3ds">
+ <_comment>3D Studio (V1)</_comment>
+ <magic priority="50">
+ <!-- nested matches: each inner match narrows its parent, so "MM" at
+ offset 0, the six bytes at offset 6 and "==" at offset 16 are all
+ required for a hit -->
+ <match value="MM" type="string" offset="0">
+ <match value="\x02\x00\x0A\x00\x00\x00" type="string" offset="6">
+ <match value="==" type="string" offset="16"/>
+ </match>
+ </match>
+ </magic>
+ <glob pattern="*.3ds"/>
+ </mime-type>
<mime-type type="image/aces">
<_comment>ACES Image Container File</_comment>
<magic priority="50">
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/OneOffMimeTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/OneOffMimeTest.java
index 45c491639..fb0ad4b54 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/OneOffMimeTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/OneOffMimeTest.java
@@ -18,6 +18,7 @@ package org.apache.tika.mime;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import java.io.File;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
@@ -28,6 +29,7 @@ import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
+import org.apache.tika.Tika;
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -46,6 +48,21 @@ public class OneOffMimeTest extends TikaTest {
assertByName(mime, p);
}
+ @Test
+ @Disabled("again for development purposes with files that aren't suitable for the repo")
+ public void testDir() throws Exception {
+ Path root = Paths.get("");
+ Tika tika = new Tika();
+ for (File f : root.toFile().listFiles()) {
+ String fileMime = tika.detect(f);
+ String streamMime = "";
+ try (InputStream is = Files.newInputStream(f.toPath())) {
+ streamMime = tika.detect(is);
+ }
+ System.out.println(f.getName() + " fileMime=" + fileMime + " stream=" + streamMime);
+ }
+ }
+
private void assertByName(String expected, Path p) throws Exception {
Metadata metadata = new Metadata();
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, p.getFileName().toString());