You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2013/07/19 15:46:35 UTC
svn commit: r1504877 - in /tika/trunk/tika-core/src:
main/java/org/apache/tika/detect/MagicDetector.java
test/java/org/apache/tika/detect/MagicDetectorTest.java
Author: nick
Date: Fri Jul 19 13:46:35 2013
New Revision: 1504877
URL: http://svn.apache.org/r1504877
Log:
TIKA-1146 Support for case-insensitive string matching on magic patterns (ASCII text only, since matching works at the byte level). Also adds more magic detection tests covering several of the string formats.
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
tika/trunk/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java?rev=1504877&r1=1504876&r2=1504877&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java Fri Jul 19 13:46:35 2013
@@ -32,6 +32,10 @@ import org.apache.tika.mime.MediaType;
* Content type detection based on magic bytes, i.e. type-specific patterns
* near the beginning of the document input stream.
*
+ * Because this works on bytes, not characters, string matching is done
+ * as ISO_8859_1 by default. To match using a different encoding, supply
+ * a type other than "string" / "stringignorecase" (e.g. "unicodeLE").
+ *
* @since Apache Tika 0.3
*/
public class MagicDetector implements Detector {
@@ -62,7 +66,8 @@ public class MagicDetector implements De
return new MagicDetector(
mediaType, patternBytes, maskBytes,
- type.equals("regex"), start, end);
+ type.equals("regex"), type.equals("stringignorecase"),
+ start, end);
}
private static byte[] decodeValue(String value, String type) {
@@ -89,6 +94,8 @@ public class MagicDetector implements De
|| type.equals("unicodeLE")
|| type.equals("unicodeBE")) {
decoded = decodeString(value, type);
+ } else if (type.equals("stringignorecase")) {
+ decoded = decodeString(value.toLowerCase(), type);
} else if (type.equals("byte")) {
decoded = tmpVal.getBytes();
} else if (type.equals("host16") || type.equals("little16")) {
@@ -212,6 +219,11 @@ public class MagicDetector implements De
private final boolean isRegex;
/**
+ * True if we're doing a case-insensitive string match, false otherwise.
+ */
+ private final boolean isStringIgnoreCase;
+
+ /**
* Bit mask that is applied to the source bytes before pattern matching.
*/
private final byte[] mask;
@@ -275,6 +287,16 @@ public class MagicDetector implements De
MediaType type, byte[] pattern, byte[] mask,
boolean isRegex,
int offsetRangeBegin, int offsetRangeEnd) {
+ this(type, pattern, mask, isRegex, false, offsetRangeBegin, offsetRangeEnd);
+ }
+ /**
+ * Creates a detector for input documents that meet the specified
+ * magic match.
+ */
+ public MagicDetector(
+ MediaType type, byte[] pattern, byte[] mask,
+ boolean isRegex, boolean isStringIgnoreCase,
+ int offsetRangeBegin, int offsetRangeEnd) {
if (type == null) {
throw new IllegalArgumentException("Matching media type is null");
} else if (pattern == null) {
@@ -289,6 +311,7 @@ public class MagicDetector implements De
this.type = type;
this.isRegex = isRegex;
+ this.isStringIgnoreCase = isStringIgnoreCase;
this.patternLength = Math.max(pattern.length, mask != null ? mask.length : 0);
@@ -365,7 +388,12 @@ public class MagicDetector implements De
}
if (this.isRegex) {
- Pattern p = Pattern.compile(new String(this.pattern));
+ int flags = 0;
+ if (this.isStringIgnoreCase) {
+ flags = Pattern.CASE_INSENSITIVE;
+ }
+
+ Pattern p = Pattern.compile(new String(this.pattern), flags);
ByteBuffer bb = ByteBuffer.wrap(buffer);
CharBuffer result = ISO_8859_1.decode(bb);
@@ -387,8 +415,13 @@ public class MagicDetector implements De
// Loop until we've covered the entire offset range
for (int i = 0; i <= offsetRangeEnd - offsetRangeBegin; i++) {
boolean match = true;
+ int masked;
for (int j = 0; match && j < length; j++) {
- match = (buffer[i + j] & mask[j]) == pattern[j];
+ masked = (buffer[i + j] & mask[j]);
+ if (this.isStringIgnoreCase) {
+ masked = Character.toLowerCase(masked);
+ }
+ match = (masked == pattern[j]);
}
if (match) {
return type;
Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java?rev=1504877&r1=1504876&r2=1504877&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java Fri Jul 19 13:46:35 2013
@@ -178,10 +178,39 @@ public class MagicDetectorTest extends T
InputStream stream = new RestrictiveInputStream(data);
assertEquals(testMT, detector.detect(stream, new Metadata()));
}
+
+ public void testDetectString() throws Exception {
+ String data = "abcdEFGhijklmnoPQRstuvwxyz0123456789";
+ MediaType testMT = new MediaType("application", "test");
+ Detector detector;
+
+ // Check regular String matching
+ detector = MagicDetector.parse(testMT, "string", "0:20", "abcd", null);
+ assertDetect(detector, testMT, data.getBytes("ASCII"));
+ detector = MagicDetector.parse(testMT, "string", "0:20", "cdEFGh", null);
+ assertDetect(detector, testMT, data.getBytes("ASCII"));
+
+ // Check Little Endian and Big Endian utf-16 strings
+ detector = MagicDetector.parse(testMT, "unicodeLE", "0:20", "cdEFGh", null);
+ assertDetect(detector, testMT, data.getBytes("UTF-16LE"));
+ detector = MagicDetector.parse(testMT, "unicodeBE", "0:20", "cdEFGh", null);
+ assertDetect(detector, testMT, data.getBytes("UTF-16BE"));
+
+ // Check case ignoring String matching
+ detector = MagicDetector.parse(testMT, "stringignorecase", "0:20", "BcDeFgHiJKlm", null);
+ assertDetect(detector, testMT, data.getBytes("ASCII"));
+ }
private void assertDetect(Detector detector, MediaType type, String data) {
try {
byte[] bytes = data.getBytes("ASCII");
+ assertDetect(detector, type, bytes);
+ } catch (IOException e) {
+ fail("Unexpected exception from MagicDetector");
+ }
+ }
+ private void assertDetect(Detector detector, MediaType type, byte[] bytes) {
+ try {
InputStream stream = new ByteArrayInputStream(bytes);
assertEquals(type, detector.detect(stream, new Metadata()));