You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/11/11 12:25:46 UTC

svn commit: r1200816 - /tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java

Author: jukka
Date: Fri Nov 11 11:25:46 2011
New Revision: 1200816

URL: http://svn.apache.org/viewvc?rev=1200816&view=rev
Log:
TIKA-780: Optimize loading of the media type registry

Use a SAX parser to stream through the mime-info database

Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java?rev=1200816&r1=1200815&r2=1200816&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java Fri Nov 11 11:25:46 2011
@@ -16,24 +16,28 @@
  */
 package org.apache.tika.mime;
 
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.LinkedList;
 import java.util.List;
 
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
 import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerException;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.sax.SAXResult;
 
-import org.w3c.dom.Attr;
 import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.w3c.dom.NamedNodeMap;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
+import org.xml.sax.Attributes;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
 
 /**
  * A reader for XML files compliant with the freedesktop MIME-info DTD.
@@ -85,25 +89,37 @@ import org.xml.sax.SAXException;
  *         type CDATA #REQUIRED>
  *  ]>
  * </pre>
- * 
- * 
+ *
  * @see http://freedesktop.org/wiki/Standards_2fshared_2dmime_2dinfo_2dspec
- * 
  */
-final class MimeTypesReader implements MimeTypesReaderMetKeys {
+class MimeTypesReader extends DefaultHandler implements MimeTypesReaderMetKeys {
 
     private final MimeTypes types;
 
+    /** Current type */
+    private MimeType type = null;
+
+    private int priority;
+
+    private Clause clause = null;
+
+    private List<Clause> clauses = new LinkedList<Clause>();
+
+    private final LinkedList<List<Clause>> clauseStack =
+            new LinkedList<List<Clause>>();
+
+    private final StringBuilder characters = new StringBuilder();
+
     MimeTypesReader(MimeTypes types) {
         this.types = types;
     }
 
     void read(InputStream stream) throws IOException, MimeTypeException {
         try {
-            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
-            DocumentBuilder builder = factory.newDocumentBuilder();
-            Document document = builder.parse(new InputSource(stream));
-            read(document);
+            SAXParserFactory factory = SAXParserFactory.newInstance();
+            factory.setNamespaceAware(false);
+            SAXParser parser = factory.newSAXParser();
+            parser.parse(stream, this);
         } catch (ParserConfigurationException e) {
             throw new MimeTypeException("Unable to create an XML parser", e);
         } catch (SAXException e) {
@@ -112,144 +128,115 @@ final class MimeTypesReader implements M
     }
 
     void read(Document document) throws MimeTypeException {
-        Element element = document.getDocumentElement();
-        if (element != null && element.getTagName().equals(MIME_INFO_TAG)) {
-            NodeList nodes = element.getChildNodes();
-            for (int i = 0; i < nodes.getLength(); i++) {
-                Node node = nodes.item(i);
-                if (node.getNodeType() == Node.ELEMENT_NODE) {
-                    Element child = (Element) node;
-                    if (child.getTagName().equals(MIME_TYPE_TAG)) {
-                        readMimeType(child);
-                    }
-                }
-            }
-        } else {
-            throw new MimeTypeException(
-                    "Not a <" + MIME_INFO_TAG + "/> configuration document: "
-                    + element.getTagName());
+        try {
+            TransformerFactory factory = TransformerFactory.newInstance();
+            Transformer transformer = factory.newTransformer();
+            transformer.transform(new DOMSource(document), new SAXResult(this));
+        } catch (TransformerException e) {
+            throw new MimeTypeException("Failed to parse type registry", e);
         }
     }
 
-    /** Read Element named mime-type. */
-    private void readMimeType(Element element) throws MimeTypeException {
-        String name = element.getAttribute(MIME_TYPE_TYPE_ATTR);
-        MimeType type = types.forName(name);
-
-        NodeList nodes = element.getChildNodes();
-        for (int i = 0; i < nodes.getLength(); i++) {
-            Node node = nodes.item(i);
-            if (node.getNodeType() == Node.ELEMENT_NODE) {
-                Element nodeElement = (Element) node;
-                if (nodeElement.getTagName().equals(COMMENT_TAG)) {
-                    type.setDescription(
-                            nodeElement.getFirstChild().getNodeValue());
-                } else if (nodeElement.getTagName().equals(GLOB_TAG)) {
-                    boolean useRegex = Boolean.valueOf(nodeElement.getAttribute(ISREGEX_ATTR));
-                    types.addPattern(type, nodeElement.getAttribute(PATTERN_ATTR), useRegex);
-                } else if (nodeElement.getTagName().equals(MAGIC_TAG)) {
-                    readMagic(nodeElement, type);
-                } else if (nodeElement.getTagName().equals(ALIAS_TAG)) {
-                    String alias = nodeElement.getAttribute(ALIAS_TYPE_ATTR);
-                    MediaType aliasType = MediaType.parse(alias);
-                    if (aliasType != null) {
-                        types.addAlias(type, aliasType);
-                    } else {
-                        throw new MimeTypeException(
-                                "Invalid media type alias: " + alias);
-                    }
-                } else if (nodeElement.getTagName().equals(ROOT_XML_TAG)) {
-                    readRootXML(nodeElement, type);
-                } else if (nodeElement.getTagName().equals(SUB_CLASS_OF_TAG)) {
-                    String parent = nodeElement.getAttribute(SUB_CLASS_TYPE_ATTR);
-                    types.setSuperType(type, MediaType.parse(parent));
+    @Override
+    public InputSource resolveEntity(String publicId, String systemId) {
+        return new InputSource(new ByteArrayInputStream(new byte[0]));
+    }
+
+    @Override
+    public void startElement(
+            String uri, String localName, String qName,
+            Attributes attributes) throws SAXException {
+        if (type == null) {
+            if (MIME_TYPE_TAG.equals(qName)) {
+                String name = attributes.getValue(MIME_TYPE_TYPE_ATTR);
+                try {
+                    type = types.forName(name);
+                } catch (MimeTypeException e) {
+                    throw new SAXException(e);
                 }
             }
-        }
-
-        types.add(type);
-    }
-
-    /**
-     * Read Element named magic. 
-     * @throws MimeTypeException if the configuration is invalid
-     */
-    private void readMagic(Element element, MimeType mimeType)
-            throws MimeTypeException {
-        int priority = 50;
-        String value = element.getAttribute(MAGIC_PRIORITY_ATTR);
-        if (value != null && value.length() > 0) {
-            priority = Integer.parseInt(value);
-        }
-
-        for (Clause clause : readMatches(element, mimeType.getType())) {
-            Magic magic = new Magic(mimeType, priority, clause);
-            mimeType.addMagic(magic);
-        }
-    }
-
-    private List<Clause> readMatches(Element element, MediaType mediaType) throws MimeTypeException {
-        NodeList nodes = element.getChildNodes();
-        int n = nodes.getLength();
-        if (n == 0) {
-            return Collections.emptyList();
-        }
-
-        List<Clause> clauses = new ArrayList<Clause>();
-        for (int i = 0; i < n; i++) {
-            Node node = nodes.item(i);
-            if (node.getNodeType() == Node.ELEMENT_NODE) {
-                Element nodeElement = (Element) node;
-                if (nodeElement.getTagName().equals(MATCH_TAG)) {
-                    clauses.add(readMatch(nodeElement, mediaType));
+        } else if (ALIAS_TAG.equals(qName)) {
+            String alias = attributes.getValue(ALIAS_TYPE_ATTR);
+            types.addAlias(type, MediaType.parse(alias));
+        } else if (SUB_CLASS_OF_TAG.equals(qName)) {
+            String parent = attributes.getValue(SUB_CLASS_TYPE_ATTR);
+            types.setSuperType(type, MediaType.parse(parent));
+        } else if (GLOB_TAG.equals(qName)) {
+            String pattern = attributes.getValue(PATTERN_ATTR);
+            String isRegex = attributes.getValue(ISREGEX_ATTR);
+            if (pattern != null) {
+                try {
+                    types.addPattern(type, pattern, Boolean.valueOf(isRegex));
+                } catch (MimeTypeException e) {
+                    throw new SAXException(e);
                 }
             }
-        }
-        return clauses;
-    }
-
-    /** Read Element named match. */
-    private Clause readMatch(Element element, MediaType mediaType) throws MimeTypeException {
-        Clause clause = getMagicClause(element, mediaType);
-
-        List<Clause> subClauses = readMatches(element, mediaType);
-        if (subClauses.size() == 0) {
-            return clause;
-        } else if (subClauses.size() == 1) {
-            return new AndClause(clause, subClauses.get(0));
-        } else {
-            return new AndClause(clause, new OrClause(subClauses));
+        } else if (ROOT_XML_TAG.equals(qName)) {
+            String namespace = attributes.getValue(NS_URI_ATTR);
+            String name = attributes.getValue(LOCAL_NAME_ATTR);
+            type.addRootXML(namespace, name);
+        } else if (MATCH_TAG.equals(qName)) {
+            String kind = attributes.getValue(MATCH_TYPE_ATTR);
+            String offset = attributes.getValue(MATCH_OFFSET_ATTR);
+            String value = attributes.getValue(MATCH_VALUE_ATTR);
+            String mask = attributes.getValue(MATCH_MASK_ATTR);
+            if (kind == null) {
+                kind = "string";
+            }
+            clause = new MagicMatch(type.getType(), kind, offset, value, mask);
+            clauseStack.addLast(clauses);
+            clauses = null;
+        } else if (MAGIC_TAG.equals(qName)) {
+            String value = attributes.getValue(MAGIC_PRIORITY_ATTR);
+            if (value != null && value.length() > 0) {
+                priority = Integer.parseInt(value);
+            } else {
+                priority = 50;
+            }
         }
     }
 
-    private Clause getMagicClause(Element element, MediaType mediaType)
-            throws MimeTypeException {
-        String type = "string";
-        String offset = null;
-        String value = null;
-        String mask = null;
-
-        NamedNodeMap attrs = element.getAttributes();
-        for (int i = 0; i < attrs.getLength(); i++) {
-            Attr attr = (Attr) attrs.item(i);
-            if (attr.getName().equals(MATCH_OFFSET_ATTR)) {
-                offset = attr.getValue();
-            } else if (attr.getName().equals(MATCH_TYPE_ATTR)) {
-                type = attr.getValue();
-            } else if (attr.getName().equals(MATCH_VALUE_ATTR)) {
-                value = attr.getValue();
-            } else if (attr.getName().equals(MATCH_MASK_ATTR)) {
-                mask = attr.getValue();
+    @Override
+    public void endElement(String uri, String localName, String qName) {
+        if (type != null) {
+            if (MIME_TYPE_TAG.equals(qName)) {
+                type = null;
+            } else if (COMMENT_TAG.equals(qName)) {
+                type.setDescription(characters.toString().trim());
+            } else if (MATCH_TAG.equals(qName)) {
+                if (clauses != null) {
+                    Clause subclause;
+                    if (clauses.size() == 1) {
+                        subclause = clauses.get(0);
+                    } else {
+                        subclause = new OrClause(clauses);
+                    }
+                    clause = new AndClause(clause, subclause);
+                }
+                clauses = clauseStack.removeLast();
+                if (clauses == null) {
+                    clauses = Collections.singletonList(clause);
+                } else {
+                    if (clauses.size() == 1) {
+                        clauses = new ArrayList<Clause>(clauses);
+                    }
+                    clauses.add(clause);
+                }
+            } else if (MAGIC_TAG.equals(qName)) {
+                if (clauses != null) {
+                    for (Clause clause : clauses) {
+                        type.addMagic(new Magic(type, priority, clause));
+                    }
+                    clauses = null;
+                }
             }
         }
-
-        return new MagicMatch(mediaType, type, offset, value, mask);
+        characters.setLength(0);
     }
 
-    /** Read Element named root-XML. */
-    private void readRootXML(Element element, MimeType mimeType) {
-        mimeType.addRootXML(element.getAttribute(NS_URI_ATTR), element
-                .getAttribute(LOCAL_NAME_ATTR));
+    @Override
+    public void characters(char[] ch, int start, int length) {
+        characters.append(ch, start, length);
     }
 
 }