You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/11/18 13:14:06 UTC

svn commit: r881744 - in /lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime: MagicMatch.java MimeTypesReader.java

Author: jukka
Date: Wed Nov 18 12:14:06 2009
New Revision: 881744

URL: http://svn.apache.org/viewvc?rev=881744&view=rev
Log:
TIKA-321: Optimize type detection speed

Move the magic pattern parsing code to MimeTypesReader

Modified:
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MagicMatch.java
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MagicMatch.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MagicMatch.java?rev=881744&r1=881743&r2=881744&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MagicMatch.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MagicMatch.java Wed Nov 18 12:14:06 2009
@@ -17,9 +17,7 @@
 package org.apache.tika.mime;
 
 import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
 import java.io.IOException;
-import java.util.Arrays;
 
 import org.apache.tika.detect.MagicDetector;
 import org.apache.tika.metadata.Metadata;
@@ -29,141 +27,20 @@
  */
 class MagicMatch implements Clause {
 
-    private static final MediaType MATCH =
-        new MediaType("x-tika", "magic-match");
-
-    private final int length;
-
     private final MagicDetector detector;
 
-    MagicMatch(int offsetStart, int offsetEnd, String type, String mask,
-            String value) throws MimeTypeException {
-
-        byte[] patternBytes = decodeValue(type, value);
-        byte[] maskBytes;
-        if (mask != null) {
-            maskBytes = decodeValue(type, mask);
-        } else {
-            maskBytes = new byte[patternBytes.length];
-            Arrays.fill(maskBytes, (byte) 0xff);
-        }
-        this.length = Math.max(patternBytes.length, maskBytes.length);
-
-        if (patternBytes.length < length) {
-            byte[] buffer = new byte[length];
-            System.arraycopy(patternBytes, 0, buffer, 0, patternBytes.length);
-            patternBytes = buffer;
-        } else if (maskBytes.length < length) {
-            byte[] buffer = new byte[length];
-            Arrays.fill(buffer, (byte) 0xff);
-            System.arraycopy(maskBytes, 0, buffer, 0, maskBytes.length);
-            maskBytes = buffer;
-        }
-
-        for (int i = 0; i < length; i++) {
-            patternBytes[i] &= maskBytes[i];
-        }
-
-        this.detector = new MagicDetector(
-                MATCH, patternBytes, maskBytes, offsetStart, offsetEnd);
-    }
-
-    private byte[] decodeValue(String type, String value)
-            throws MimeTypeException {
-        // Preliminary check
-        if ((value == null) || (type == null)) {
-            return null;
-        }
-
-        byte[] decoded = null;
-        String tmpVal = null;
-        int radix = 8;
-
-        // hex
-        if (value.startsWith("0x")) {
-            tmpVal = value.substring(2);
-            radix = 16;
-        } else {
-            tmpVal = value;
-            radix = 8;
-        }
-
-        if (type.equals("string")) {
-            decoded = decodeString(value);
-
-        } else if (type.equals("byte")) {
-            decoded = tmpVal.getBytes();
-
-        } else if (type.equals("host16") || type.equals("little16")) {
-            int i = Integer.parseInt(tmpVal, radix);
-            decoded = new byte[] { (byte) (i >> 8), (byte) (i & 0x00FF) };
-
-        } else if (type.equals("big16")) {
-            int i = Integer.parseInt(tmpVal, radix);
-            decoded = new byte[] { (byte) (i >> 8), (byte) (i & 0x00FF) };
-
-        } else if (type.equals("host32") || type.equals("little32")) {
-            long i = Long.parseLong(tmpVal, radix);
-            decoded = new byte[] { (byte) ((i & 0x000000FF)),
-                    (byte) ((i & 0x0000FF00) >> 8),
-                    (byte) ((i & 0x00FF0000) >> 16),
-                    (byte) ((i & 0xFF000000) >> 24) };
-
-        } else if (type.equals("big32")) {
-            long i = Long.parseLong(tmpVal, radix);
-            decoded = new byte[] { (byte) ((i & 0xFF000000) >> 24),
-                    (byte) ((i & 0x00FF0000) >> 16),
-                    (byte) ((i & 0x0000FF00) >> 8), (byte) ((i & 0x000000FF)) };
-        }
-        return decoded;
-    }
-
-    private byte[] decodeString(String value) throws MimeTypeException {
-        if (value.startsWith("0x")) {
-            byte[] bytes = new byte[(value.length() - 2) / 2];
-            for (int i = 0; i < bytes.length; i++) {
-                bytes[i] = (byte)
-                    Integer.parseInt(value.substring(2 + i * 2, 4 + i * 2), 16);
-            }
-            return bytes;
-        }
-
-        try {
-            ByteArrayOutputStream decoded = new ByteArrayOutputStream();
+    private final int length;
 
-            for (int i = 0; i < value.length(); i++) {
-                if (value.charAt(i) == '\\') {
-                    if (value.charAt(i + 1) == '\\') {
-                        decoded.write('\\');
-                        i++;
-                    } else if (value.charAt(i + 1) == 'x') {
-                        decoded.write(Integer.parseInt(
-                                value.substring(i + 2, i + 4), 16));
-                        i += 3;
-                    } else {
-                        int j = i + 1;
-                        while ((j < i + 4) && (j < value.length())
-                                && (Character.isDigit(value.charAt(j)))) {
-                            j++;
-                        }
-                        decoded.write(Short.decode(
-                                "0" + value.substring(i + 1, j)).byteValue());
-                        i = j - 1;
-                    }
-                } else {
-                    decoded.write(value.charAt(i));
-                }
-            }
-            return decoded.toByteArray();
-        } catch (NumberFormatException e) {
-            throw new MimeTypeException("Invalid string value: " + value, e);
-        }
+    MagicMatch(MagicDetector detector, int length) throws MimeTypeException {
+        this.detector = detector;
+        this.length = length;
     }
 
     public boolean eval(byte[] data) {
         try {
             return detector.detect(
-                    new ByteArrayInputStream(data), new Metadata()) == MATCH;
+                    new ByteArrayInputStream(data), new Metadata())
+                    != MediaType.OCTET_STREAM;
         } catch (IOException e) {
             // Should never happen with a ByteArrayInputStream
             return false;

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java?rev=881744&r1=881743&r2=881744&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java Wed Nov 18 12:14:06 2009
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -16,7 +16,7 @@
  */
 package org.apache.tika.mime;
 
-// DOM imports
+import org.apache.tika.detect.MagicDetector;
 import org.w3c.dom.Attr;
 import org.w3c.dom.Node;
 import org.w3c.dom.Element;
@@ -26,7 +26,7 @@
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
 
-// JDK imports
+import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import javax.xml.parsers.DocumentBuilder;
@@ -208,17 +208,25 @@
 
     /** Read Element named match. */
     private MagicMatch readMatch(Element element) throws MimeTypeException {
-
-        String offset = null;
+        String type = "string";
+        int start = 0;
+        int end = 0;
         String value = null;
         String mask = null;
-        String type = null;
 
         NamedNodeMap attrs = element.getAttributes();
         for (int i = 0; i < attrs.getLength(); i++) {
             Attr attr = (Attr) attrs.item(i);
             if (attr.getName().equals(MATCH_OFFSET_ATTR)) {
-                offset = attr.getValue();
+                String offset = attr.getValue();
+                int colon = offset.indexOf(':');
+                if (colon == -1) {
+                    start = Integer.parseInt(offset);
+                    end = start;
+                } else {
+                    start = Integer.parseInt(offset.substring(0, colon));
+                    end = Integer.parseInt(offset.substring(colon + 1));
+                }
             } else if (attr.getName().equals(MATCH_TYPE_ATTR)) {
                 type = attr.getValue();
             } else if (attr.getName().equals(MATCH_VALUE_ATTR)) {
@@ -228,22 +236,116 @@
             }
         }
 
-        // Parse OffSet
-        int offStart = 0;
-        int offEnd = 0;
-        if (offset != null) {
-            int colon = offset.indexOf(':');
-            if (colon == -1) {
-                offStart = Integer.parseInt(offset);
-                offEnd = offStart;
-            } else {
-                offStart = Integer.parseInt(offset.substring(0, colon));
-                offEnd = Integer.parseInt(offset.substring(colon + 1));
-                offEnd = Math.max(offStart, offEnd);
+        if (value == null) {
+            throw new MimeTypeException("Missing magic byte pattern");
+        } else if (start < 0 || end < start) {
+            throw new MimeTypeException(
+                    "Invalid offset range: [" + start + "," + end + "]");
+        }
+
+        byte[] patternBytes = decodeValue(type, value);
+        int length = patternBytes.length;
+        byte[] maskBytes = null;
+        if (mask != null) {
+            maskBytes = decodeValue(type, mask);
+            length = Math.max(patternBytes.length, maskBytes.length);
+        }
+
+        MagicDetector detector = new MagicDetector(
+                MediaType.TEXT_PLAIN, patternBytes, maskBytes, start, end);
+        return new MagicMatch(detector, length);
+    }
+
+    private byte[] decodeValue(String type, String value)
+            throws MimeTypeException {
+        // Preliminary check
+        if ((value == null) || (type == null)) {
+            return null;
+        }
+
+        byte[] decoded = null;
+        String tmpVal = null;
+        int radix = 8;
+
+        // hex
+        if (value.startsWith("0x")) {
+            tmpVal = value.substring(2);
+            radix = 16;
+        } else {
+            tmpVal = value;
+            radix = 8;
+        }
+
+        if (type.equals("string")) {
+            decoded = decodeString(value);
+
+        } else if (type.equals("byte")) {
+            decoded = tmpVal.getBytes();
+
+        } else if (type.equals("host16") || type.equals("little16")) {
+            int i = Integer.parseInt(tmpVal, radix);
+            decoded = new byte[] { (byte) (i >> 8), (byte) (i & 0x00FF) };
+
+        } else if (type.equals("big16")) {
+            int i = Integer.parseInt(tmpVal, radix);
+            decoded = new byte[] { (byte) (i >> 8), (byte) (i & 0x00FF) };
+
+        } else if (type.equals("host32") || type.equals("little32")) {
+            long i = Long.parseLong(tmpVal, radix);
+            decoded = new byte[] { (byte) ((i & 0x000000FF)),
+                    (byte) ((i & 0x0000FF00) >> 8),
+                    (byte) ((i & 0x00FF0000) >> 16),
+                    (byte) ((i & 0xFF000000) >> 24) };
+
+        } else if (type.equals("big32")) {
+            long i = Long.parseLong(tmpVal, radix);
+            decoded = new byte[] { (byte) ((i & 0xFF000000) >> 24),
+                    (byte) ((i & 0x00FF0000) >> 16),
+                    (byte) ((i & 0x0000FF00) >> 8), (byte) ((i & 0x000000FF)) };
+        }
+        return decoded;
+    }
+
+    private byte[] decodeString(String value) throws MimeTypeException {
+        if (value.startsWith("0x")) {
+            byte[] bytes = new byte[(value.length() - 2) / 2];
+            for (int i = 0; i < bytes.length; i++) {
+                bytes[i] = (byte)
+                Integer.parseInt(value.substring(2 + i * 2, 4 + i * 2), 16);
             }
+            return bytes;
         }
 
-        return new MagicMatch(offStart, offEnd, type, mask, value);
+        try {
+            ByteArrayOutputStream decoded = new ByteArrayOutputStream();
+
+            for (int i = 0; i < value.length(); i++) {
+                if (value.charAt(i) == '\\') {
+                    if (value.charAt(i + 1) == '\\') {
+                        decoded.write('\\');
+                        i++;
+                    } else if (value.charAt(i + 1) == 'x') {
+                        decoded.write(Integer.parseInt(
+                                value.substring(i + 2, i + 4), 16));
+                        i += 3;
+                    } else {
+                        int j = i + 1;
+                        while ((j < i + 4) && (j < value.length())
+                                && (Character.isDigit(value.charAt(j)))) {
+                            j++;
+                        }
+                        decoded.write(Short.decode(
+                                "0" + value.substring(i + 1, j)).byteValue());
+                        i = j - 1;
+                    }
+                } else {
+                    decoded.write(value.charAt(i));
+                }
+            }
+            return decoded.toByteArray();
+        } catch (NumberFormatException e) {
+            throw new MimeTypeException("Invalid string value: " + value, e);
+        }
     }
 
     /** Read Element named root-XML. */