You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/08/09 16:41:50 UTC

[tika] branch main updated: TIKA-3833 -- improve mime type detection for bz2 files.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new e32c174ef TIKA-3833 -- improve mime type detection for bz2 files.
e32c174ef is described below

commit e32c174effb70e30b7800905f1e86d8efe4dcbec
Author: tallison <ta...@apache.org>
AuthorDate: Tue Aug 9 12:41:36 2022 -0400

    TIKA-3833 -- improve mime type detection for bz2 files.
---
 .../org/apache/tika/mime/tika-mimetypes.xml         |   7 +++----
 .../org/apache/tika/detect/MagicDetectorTest.java   |  20 ++++++++++++++++++++
 .../test-documents/bz2/bzip2-8-file.txt.bz2         | Bin 0 -> 42 bytes
 .../resources/test-documents/bz2/empty-file.txt.bz2 | Bin 0 -> 14 bytes
 .../test-documents/bz2/lbzip2-8-file.txt.bz2        | Bin 0 -> 41 bytes
 .../resources/test-documents/bz2/small-file.txt.bz2 | Bin 0 -> 42 bytes
 .../test-documents/bz2/test-file-1.csv.bz2          | Bin 0 -> 299 bytes
 .../test-documents/bz2/test-file-2.csv.bz2          | Bin 0 -> 296 bytes
 8 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index babd241db..f3678ec09 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3506,7 +3506,7 @@
 
   <mime-type type="application/x-bzip">
     <magic priority="40">
-      <match value="BZh" type="string" offset="0"/>
+      <match value="BZ0" type="string" offset="0"/>
     </magic>
     <glob pattern="*.bz"/>
     <glob pattern="*.tbz"/>
@@ -3515,9 +3515,8 @@
   <mime-type type="application/x-bzip2">
     <sub-class-of type="application/x-bzip"/>
     <_comment>Bzip 2 UNIX Compressed File</_comment>
-    <!-- slightly higher than bzip because slightly longer -->
-    <magic priority="41">
-      <match value="\x42\x5a\x68\x39\x31" type="string" offset="0"/>
+    <magic priority="40">
+      <match value="BZh[1-9]" type="regex" offset="0"/>
     </magic>
     <glob pattern="*.bz2"/>
     <glob pattern="*.tbz2"/>
diff --git a/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java b/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java
index d965e2109..3a86a53b3 100644
--- a/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java
+++ b/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java
@@ -22,6 +22,7 @@ import static java.nio.charset.StandardCharsets.UTF_16LE;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.fail;
 
+import java.io.BufferedInputStream;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
@@ -29,6 +30,7 @@ import java.io.InputStream;
 import org.apache.commons.io.IOUtils;
 import org.junit.jupiter.api.Test;
 
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 
@@ -245,4 +247,22 @@ public class MagicDetectorTest {
         }
     }
 
+    @Test
+    public void testBZ2Detection() throws Exception {
+        Detector detector = new TikaConfig().getDetector();
+        for (String bz2 : new String[]{"bzip2-8-file.txt.bz2",
+                "empty-file.txt.bz2", "lbzip2-8-file.txt.bz2",
+                "small-file.txt.bz2", "test-file-1.csv.bz2",
+                "test-file-2.csv.bz2"}) {
+            assertEquals("application/x-bzip2", detect(detector, bz2));
+        }
+    }
+
+    private String detect(Detector detector, String bz2Name) throws IOException  {
+        try (InputStream is = new BufferedInputStream(
+                this.getClass().getResourceAsStream(
+                        "/test-documents/bz2/" + bz2Name))) {
+            return detector.detect(is, new Metadata()).toString();
+        }
+    }
 }
diff --git a/tika-core/src/test/resources/test-documents/bz2/bzip2-8-file.txt.bz2 b/tika-core/src/test/resources/test-documents/bz2/bzip2-8-file.txt.bz2
new file mode 100644
index 000000000..aa6dde6e4
Binary files /dev/null and b/tika-core/src/test/resources/test-documents/bz2/bzip2-8-file.txt.bz2 differ
diff --git a/tika-core/src/test/resources/test-documents/bz2/empty-file.txt.bz2 b/tika-core/src/test/resources/test-documents/bz2/empty-file.txt.bz2
new file mode 100644
index 000000000..b56f3b974
Binary files /dev/null and b/tika-core/src/test/resources/test-documents/bz2/empty-file.txt.bz2 differ
diff --git a/tika-core/src/test/resources/test-documents/bz2/lbzip2-8-file.txt.bz2 b/tika-core/src/test/resources/test-documents/bz2/lbzip2-8-file.txt.bz2
new file mode 100644
index 000000000..4d6fe221b
Binary files /dev/null and b/tika-core/src/test/resources/test-documents/bz2/lbzip2-8-file.txt.bz2 differ
diff --git a/tika-core/src/test/resources/test-documents/bz2/small-file.txt.bz2 b/tika-core/src/test/resources/test-documents/bz2/small-file.txt.bz2
new file mode 100644
index 000000000..14cbdbbd6
Binary files /dev/null and b/tika-core/src/test/resources/test-documents/bz2/small-file.txt.bz2 differ
diff --git a/tika-core/src/test/resources/test-documents/bz2/test-file-1.csv.bz2 b/tika-core/src/test/resources/test-documents/bz2/test-file-1.csv.bz2
new file mode 100644
index 000000000..e3ab32b69
Binary files /dev/null and b/tika-core/src/test/resources/test-documents/bz2/test-file-1.csv.bz2 differ
diff --git a/tika-core/src/test/resources/test-documents/bz2/test-file-2.csv.bz2 b/tika-core/src/test/resources/test-documents/bz2/test-file-2.csv.bz2
new file mode 100644
index 000000000..ca04f3e60
Binary files /dev/null and b/tika-core/src/test/resources/test-documents/bz2/test-file-2.csv.bz2 differ