You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/08/09 16:41:50 UTC
[tika] branch main updated: TIKA-3833 -- improve mime type detection for bz2 files.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new e32c174ef TIKA-3833 -- improve mime type detection for bz2 files.
e32c174ef is described below
commit e32c174effb70e30b7800905f1e86d8efe4dcbec
Author: tallison <ta...@apache.org>
AuthorDate: Tue Aug 9 12:41:36 2022 -0400
TIKA-3833 -- improve mime type detection for bz2 files.
---
.../org/apache/tika/mime/tika-mimetypes.xml | 7 +++----
.../org/apache/tika/detect/MagicDetectorTest.java | 20 ++++++++++++++++++++
.../test-documents/bz2/bzip2-8-file.txt.bz2 | Bin 0 -> 42 bytes
.../resources/test-documents/bz2/empty-file.txt.bz2 | Bin 0 -> 14 bytes
.../test-documents/bz2/lbzip2-8-file.txt.bz2 | Bin 0 -> 41 bytes
.../resources/test-documents/bz2/small-file.txt.bz2 | Bin 0 -> 42 bytes
.../test-documents/bz2/test-file-1.csv.bz2 | Bin 0 -> 299 bytes
.../test-documents/bz2/test-file-2.csv.bz2 | Bin 0 -> 296 bytes
8 files changed, 23 insertions(+), 4 deletions(-)
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index babd241db..f3678ec09 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3506,7 +3506,7 @@
<mime-type type="application/x-bzip">
<magic priority="40">
- <match value="BZh" type="string" offset="0"/>
+ <match value="BZ0" type="string" offset="0"/>
</magic>
<glob pattern="*.bz"/>
<glob pattern="*.tbz"/>
@@ -3515,9 +3515,8 @@
<mime-type type="application/x-bzip2">
<sub-class-of type="application/x-bzip"/>
<_comment>Bzip 2 UNIX Compressed File</_comment>
- <!-- slightly higher than bzip because slightly longer -->
- <magic priority="41">
- <match value="\x42\x5a\x68\x39\x31" type="string" offset="0"/>
+ <magic priority="40">
+ <match value="BZh[1-9]" type="regex" offset="0"/>
</magic>
<glob pattern="*.bz2"/>
<glob pattern="*.tbz2"/>
diff --git a/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java b/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java
index d965e2109..3a86a53b3 100644
--- a/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java
+++ b/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java
@@ -22,6 +22,7 @@ import static java.nio.charset.StandardCharsets.UTF_16LE;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.fail;
+import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
@@ -29,6 +30,7 @@ import java.io.InputStream;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test;
+import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -245,4 +247,22 @@ public class MagicDetectorTest {
}
}
+ @Test
+ public void testBZ2Detection() throws Exception {
+ Detector detector = new TikaConfig().getDetector();
+ for (String bz2 : new String[]{"bzip2-8-file.txt.bz2",
+ "empty-file.txt.bz2", "lbzip2-8-file.txt.bz2",
+ "small-file.txt.bz2", "test-file-1.csv.bz2",
+ "test-file-2.csv.bz2"}) {
+ assertEquals("application/x-bzip2", detect(detector, bz2));
+ }
+ }
+
+ private String detect(Detector detector, String bz2Name) throws IOException {
+ try (InputStream is = new BufferedInputStream(
+ this.getClass().getResourceAsStream(
+ "/test-documents/bz2/" + bz2Name))) {
+ return detector.detect(is, new Metadata()).toString();
+ }
+ }
}
diff --git a/tika-core/src/test/resources/test-documents/bz2/bzip2-8-file.txt.bz2 b/tika-core/src/test/resources/test-documents/bz2/bzip2-8-file.txt.bz2
new file mode 100644
index 000000000..aa6dde6e4
Binary files /dev/null and b/tika-core/src/test/resources/test-documents/bz2/bzip2-8-file.txt.bz2 differ
diff --git a/tika-core/src/test/resources/test-documents/bz2/empty-file.txt.bz2 b/tika-core/src/test/resources/test-documents/bz2/empty-file.txt.bz2
new file mode 100644
index 000000000..b56f3b974
Binary files /dev/null and b/tika-core/src/test/resources/test-documents/bz2/empty-file.txt.bz2 differ
diff --git a/tika-core/src/test/resources/test-documents/bz2/lbzip2-8-file.txt.bz2 b/tika-core/src/test/resources/test-documents/bz2/lbzip2-8-file.txt.bz2
new file mode 100644
index 000000000..4d6fe221b
Binary files /dev/null and b/tika-core/src/test/resources/test-documents/bz2/lbzip2-8-file.txt.bz2 differ
diff --git a/tika-core/src/test/resources/test-documents/bz2/small-file.txt.bz2 b/tika-core/src/test/resources/test-documents/bz2/small-file.txt.bz2
new file mode 100644
index 000000000..14cbdbbd6
Binary files /dev/null and b/tika-core/src/test/resources/test-documents/bz2/small-file.txt.bz2 differ
diff --git a/tika-core/src/test/resources/test-documents/bz2/test-file-1.csv.bz2 b/tika-core/src/test/resources/test-documents/bz2/test-file-1.csv.bz2
new file mode 100644
index 000000000..e3ab32b69
Binary files /dev/null and b/tika-core/src/test/resources/test-documents/bz2/test-file-1.csv.bz2 differ
diff --git a/tika-core/src/test/resources/test-documents/bz2/test-file-2.csv.bz2 b/tika-core/src/test/resources/test-documents/bz2/test-file-2.csv.bz2
new file mode 100644
index 000000000..ca04f3e60
Binary files /dev/null and b/tika-core/src/test/resources/test-documents/bz2/test-file-2.csv.bz2 differ