You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/03/07 19:55:18 UTC
[tika] branch master updated: TIKA-2591 -- Add workaround to
identify TIFFs that might confuse commons-compress's tar detection via
Daniel Schmidt
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 462ee47 TIKA-2591 -- Add workaround to identify TIFFs that might confuse commons-compress's tar detection via Daniel Schmidt
462ee47 is described below
commit 462ee4744fd426cfdb12539435627b25e789c912
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Mar 7 14:55:06 2018 -0500
TIKA-2591 -- Add workaround to identify TIFFs that might confuse commons-compress's tar detection via Daniel Schmidt
---
CHANGES.txt | 4 ++
.../tika/parser/pkg/ZipContainerDetector.java | 42 ++++++++++++++++-
.../tika/parser/pkg/ZipContainerDetectorTest.java | 55 ++++++++++++++++++++++
3 files changed, 99 insertions(+), 2 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 3f0f31a..71cb60b 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -5,6 +5,10 @@ Release 2.0.0 - ???
Other changes
+ * Add workaround to identify TIFFs that might confuse
+ commons-compress's tar detection via Daniel Schmidt
+ (TIKA-2591)
+
* Ignore non-IANA supported charsets in HTML meta-headers
during charset detection in HTMLEncodingDetector
via Andreas Meier (TIKA-2592)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index 65e2e1d..c453617 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -21,6 +21,7 @@ import static java.nio.charset.StandardCharsets.UTF_8;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Iterator;
@@ -56,6 +57,19 @@ import org.apache.tika.parser.iwork.iwana.IWork13PackageParser;
* formats to figure out exactly what the file is.
*/
public class ZipContainerDetector implements Detector {
+
+    //Regrettably, some tiff files can be incorrectly identified
+    //as tar files.  We need this ugly workaround to rule out TIFF.
+    //If commons-compress ever chooses to take over TIFF detection
+    //we can remove all of this. See TIKA-2591.
+    private static final MediaType TIFF = MediaType.image("tiff");
+
+    //Magic numbers expected in the first four bytes of a TIFF file:
+    //a two-byte byte-order mark ("MM" = big-endian, "II" = little-endian)
+    //followed by the two-byte version number stored in that byte order
+    //(0x002a = classic TIFF, 0x002b = BigTIFF).
+    private static final byte[][] TIFF_SIGNATURES = {
+            {'M', 'M', 0x00, 0x2a}, //classic TIFF, big-endian
+            {'I', 'I', 0x2a, 0x00}, //classic TIFF, little-endian
+            {'M', 'M', 0x00, 0x2b}, //BigTIFF, big-endian
+            {'I', 'I', 0x2b, 0x00}  //BigTIFF, little-endian
+    };
+
private static final Pattern MACRO_TEMPLATE_PATTERN = Pattern.compile("macroenabledtemplate$", Pattern.CASE_INSENSITIVE);
// TODO Remove this constant once we upgrade to POI 3.12 beta 2, then use PackageRelationshipTypes
@@ -86,8 +100,11 @@ public class ZipContainerDetector implements Detector {
int length = tis.peek(prefix);
MediaType type = detectArchiveFormat(prefix, length);
- if (PackageParser.isZipArchive(type)
- && TikaInputStream.isTikaInputStream(input)) {
+
+ if (type == TIFF) {
+ return TIFF;
+ } else if (PackageParser.isZipArchive(type)
+ && TikaInputStream.isTikaInputStream(input)) {
return detectZipFormat(tis);
} else if (!type.equals(MediaType.OCTET_STREAM)) {
return type;
@@ -112,7 +129,28 @@ public class ZipContainerDetector implements Detector {
}
}
+    /**
+     * Reports whether the given buffer begins with any of the byte
+     * sequences listed in {@code TIFF_SIGNATURES}.
+     *
+     * @param prefix the buffer to test
+     * @return {@code true} as soon as one signature matches the start of
+     *         {@code prefix}; {@code false} if none does
+     */
+    private static boolean isTiff(byte[] prefix) {
+        for (int i = 0; i < TIFF_SIGNATURES.length; i++) {
+            if (arrayStartWith(TIFF_SIGNATURES[i], prefix)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Checks whether {@code haystack} begins with the bytes of {@code needle}.
+     *
+     * @param needle   the byte sequence expected at the start
+     * @param haystack the buffer to inspect
+     * @return {@code true} if the first {@code needle.length} bytes of
+     *         {@code haystack} equal {@code needle}; {@code false} on any
+     *         mismatch or if {@code haystack} is shorter than {@code needle}
+     */
+    private static boolean arrayStartWith(byte[] needle, byte[] haystack) {
+        //a buffer shorter than the signature cannot start with it; without
+        //this guard the loop below would throw ArrayIndexOutOfBoundsException
+        if (haystack.length < needle.length) {
+            return false;
+        }
+        for (int i = 0; i < needle.length; i++) {
+            if (haystack[i] != needle[i]) {
+                return false;
+            }
+        }
+        return true;
+    }
+
private static MediaType detectArchiveFormat(byte[] prefix, int length) {
+ if (isTiff(prefix)) {
+ return TIFF;
+ }
try {
String name = ArchiveStreamFactory.detect(new ByteArrayInputStream(prefix, 0, length));
return PackageParser.getMediaType(name);
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
new file mode 100644
index 0000000..2865442
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.pkg;
+
+
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
+import org.apache.tika.TikaTest;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+public class ZipContainerDetectorTest extends TikaTest {
+
+    /**
+     * TIKA-2591: each sample TIFF resource must be reported as
+     * image/tiff by {@link ZipContainerDetector}.
+     */
+    @Test
+    public void testTiffWorkaround() throws Exception {
+        final String[] tiffResources = {
+                "/test-documents/testTIFF.tif",
+                "/test-documents/testTIFF_multipage.tif"
+        };
+        ZipContainerDetector detector = new ZipContainerDetector();
+        for (String resource : tiffResources) {
+            //a fresh Metadata object is used for every detection
+            Metadata metadata = new Metadata();
+            try (InputStream stream = TikaInputStream.get(getResourceAsStream(resource))) {
+                MediaType detected = detector.detect(stream, metadata);
+                assertEquals(MediaType.image("tiff"), detected);
+            }
+        }
+    }
+}
\ No newline at end of file
--
To stop receiving notification emails like this one, please contact
tallison@apache.org.