You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2013/07/19 15:46:35 UTC

svn commit: r1504877 - in /tika/trunk/tika-core/src: main/java/org/apache/tika/detect/MagicDetector.java test/java/org/apache/tika/detect/MagicDetectorTest.java

Author: nick
Date: Fri Jul 19 13:46:35 2013
New Revision: 1504877

URL: http://svn.apache.org/r1504877
Log:
TIKA-1146 Support for case-insensitive string matching on magic patterns (ASCII text only, as the matching works at the byte level). Also adds more magic detection tests covering several of the string formats.

Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
    tika/trunk/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java?rev=1504877&r1=1504876&r2=1504877&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java Fri Jul 19 13:46:35 2013
@@ -32,6 +32,10 @@ import org.apache.tika.mime.MediaType;
  * Content type detection based on magic bytes, i.e. type-specific patterns
  * near the beginning of the document input stream.
  *
+ * Because this works on bytes, not characters, by default any string
+ *  matching is done as ISO_8859_1. To match using a different
+ *  encoding, supply a type other than "string" / "stringignorecase".
+ *
  * @since Apache Tika 0.3
  */
 public class MagicDetector implements Detector {
@@ -62,7 +66,8 @@ public class MagicDetector implements De
 
         return new MagicDetector(
                 mediaType, patternBytes, maskBytes,
-                type.equals("regex"), start, end);
+                type.equals("regex"), type.equals("stringignorecase"),
+                start, end);
     }
 
     private static byte[] decodeValue(String value, String type) {
@@ -89,6 +94,8 @@ public class MagicDetector implements De
                 || type.equals("unicodeLE")
                 || type.equals("unicodeBE")) {
             decoded = decodeString(value, type);
+        } else if (type.equals("stringignorecase")) {
+            decoded = decodeString(value.toLowerCase(), type);
         } else if (type.equals("byte")) {
             decoded = tmpVal.getBytes();
         } else if (type.equals("host16") || type.equals("little16")) {
@@ -212,6 +219,11 @@ public class MagicDetector implements De
     private final boolean isRegex;
 
     /**
+     * True if we're doing a case-insensitive string match, false otherwise.
+     */
+    private final boolean isStringIgnoreCase;
+
+    /**
      * Bit mask that is applied to the source bytes before pattern matching.
      */
     private final byte[] mask;
@@ -275,6 +287,16 @@ public class MagicDetector implements De
             MediaType type, byte[] pattern, byte[] mask,
             boolean isRegex,
             int offsetRangeBegin, int offsetRangeEnd) {
+        this(type, pattern, mask, isRegex, false, offsetRangeBegin, offsetRangeEnd);
+    }
+    /**
+     * Creates a detector for input documents that meet the specified
+     * magic match, optionally ignoring case (isStringIgnoreCase).
+     */
+    public MagicDetector(
+            MediaType type, byte[] pattern, byte[] mask,
+            boolean isRegex, boolean isStringIgnoreCase,
+            int offsetRangeBegin, int offsetRangeEnd) {
         if (type == null) {
             throw new IllegalArgumentException("Matching media type is null");
         } else if (pattern == null) {
@@ -289,6 +311,7 @@ public class MagicDetector implements De
         this.type = type;
 
         this.isRegex = isRegex;
+        this.isStringIgnoreCase = isStringIgnoreCase;
 
         this.patternLength = Math.max(pattern.length, mask != null ? mask.length : 0);
 
@@ -365,7 +388,12 @@ public class MagicDetector implements De
             }
 
             if (this.isRegex) {
-                Pattern p = Pattern.compile(new String(this.pattern));
+                int flags = 0;
+                if (this.isStringIgnoreCase) {
+                    flags = Pattern.CASE_INSENSITIVE;
+                }
+                
+                Pattern p = Pattern.compile(new String(this.pattern), flags);
 
                 ByteBuffer bb = ByteBuffer.wrap(buffer);
                 CharBuffer result = ISO_8859_1.decode(bb);
@@ -387,8 +415,13 @@ public class MagicDetector implements De
                 // Loop until we've covered the entire offset range
                 for (int i = 0; i <= offsetRangeEnd - offsetRangeBegin; i++) {
                     boolean match = true;
+                    int masked;
                     for (int j = 0; match && j < length; j++) {
-                        match = (buffer[i + j] & mask[j]) == pattern[j];
+                        masked = (buffer[i + j] & mask[j]);
+                        if (this.isStringIgnoreCase) {
+                            masked = Character.toLowerCase(masked);
+                        }
+                        match = (masked == pattern[j]);
                     }
                     if (match) {
                         return type;

Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java?rev=1504877&r1=1504876&r2=1504877&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java Fri Jul 19 13:46:35 2013
@@ -178,10 +178,39 @@ public class MagicDetectorTest extends T
         InputStream stream = new RestrictiveInputStream(data);
         assertEquals(testMT, detector.detect(stream, new Metadata()));
     }
+    
+    public void testDetectString() throws Exception {
+        String data = "abcdEFGhijklmnoPQRstuvwxyz0123456789";
+        MediaType testMT = new MediaType("application", "test");
+        Detector detector;
+        
+        // Check regular String matching
+        detector = MagicDetector.parse(testMT, "string", "0:20", "abcd", null); 
+        assertDetect(detector, testMT, data.getBytes("ASCII"));
+        detector = MagicDetector.parse(testMT, "string", "0:20", "cdEFGh", null); 
+        assertDetect(detector, testMT, data.getBytes("ASCII"));
+        
+        // Check Little Endian and Big Endian utf-16 strings
+        detector = MagicDetector.parse(testMT, "unicodeLE", "0:20", "cdEFGh", null); 
+        assertDetect(detector, testMT, data.getBytes("UTF-16LE"));
+        detector = MagicDetector.parse(testMT, "unicodeBE", "0:20", "cdEFGh", null); 
+        assertDetect(detector, testMT, data.getBytes("UTF-16BE"));
+        
+        // Check case ignoring String matching
+        detector = MagicDetector.parse(testMT, "stringignorecase", "0:20", "BcDeFgHiJKlm", null); 
+        assertDetect(detector, testMT, data.getBytes("ASCII"));
+    }
 
     private void assertDetect(Detector detector, MediaType type, String data) {
         try {
             byte[] bytes = data.getBytes("ASCII");
+            assertDetect(detector, type, bytes);
+        } catch (IOException e) {
+            fail("Unexpected exception from MagicDetector");
+        }
+    }
+    private void assertDetect(Detector detector, MediaType type, byte[] bytes) {
+        try {
             InputStream stream = new ByteArrayInputStream(bytes);
             assertEquals(type, detector.detect(stream, new Metadata()));