You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/10/03 23:07:42 UTC
svn commit: r1004050 - in /tika/trunk/tika-core/src/main/java/org/apache/tika: detect/XmlRootExtractor.java mime/MimeTypes.java

Author: jukka
Date: Sun Oct  3 21:07:42 2010
New Revision: 1004050

URL: http://svn.apache.org/viewvc?rev=1004050&view=rev
Log:
TIKA-426: Parsing javascript as XML

Make the type detector fall back from application/xml to text/plain if no valid XML root element is found.

Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java?rev=1004050&r1=1004049&r2=1004050&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java Sun Oct  3 21:07:42 2010
@@ -37,26 +37,13 @@ import org.xml.sax.helpers.DefaultHandle
  */
 public class XmlRootExtractor {
 
-    private final SAXParserFactory factory;
-
-    public XmlRootExtractor() throws SAXException, ParserConfigurationException {
-        factory = SAXParserFactory.newInstance();
-        factory.setNamespaceAware(true);
-        factory.setValidating(false);
-        try {
-            factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
-        } catch (SAXNotRecognizedException e) {
-            // TIKA-271: Some XML parsers do not support the secure-processing
-            // feature, even though it's required by JAXP in Java 5. Ignoring
-            // the exception is fine here, deployments without this feature
-            // are inherently vulnerable to XML denial-of-service attacks.
-        }
-
-    }
-
     public QName extractRootElement(byte[] data) {
         ExtractorHandler handler = new ExtractorHandler();
         try {
+            SAXParserFactory factory = SAXParserFactory.newInstance();
+            factory.setNamespaceAware(true);
+            factory.setValidating(false);
+            factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
             factory.newSAXParser().parse(
                     new ByteArrayInputStream(data),
                     new OfflineContentHandler(handler));

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=1004050&r1=1004049&r2=1004050&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java Sun Oct  3 21:07:42 2010
@@ -144,13 +144,11 @@ public final class MimeTypes implements 
     /** List of all registered rootXML */
     private SortedSet<MimeType> xmls = new TreeSet<MimeType>();
 
-    private transient XmlRootExtractor xmlRootExtractor = null;
-
     public MimeTypes() {
         rootMimeType = new MimeType(MediaType.OCTET_STREAM);
         textMimeType = new MimeType(MediaType.TEXT_PLAIN);
         xmlMimeType = new MimeType(MediaType.APPLICATION_XML);
-        
+
         add(rootMimeType);
         add(textMimeType);
         add(xmlMimeType);
@@ -229,33 +227,29 @@ public final class MimeTypes implements 
         }
  
         if (result != null) {
-            try {
-                XmlRootExtractor extractor = xmlRootExtractor;
-                if (extractor  == null) {
-                    extractor = new XmlRootExtractor();
-                    xmlRootExtractor = extractor;
-                }
-
-                // When detecting generic XML (or possibly XHTML),
-                // extract the root element and match it against known types
-                if ("application/xml".equals(result.getName())
-                        || "text/html".equals(result.getName())) {
-                    QName rootElement = xmlRootExtractor.extractRootElement(data);
-                    if (rootElement != null) {
-                        for (MimeType type : xmls) {
-                            if (type.matchesXML(
-                                    rootElement.getNamespaceURI(),
-                                    rootElement.getLocalPart())) {
-                                result = type;
-                                break;
-                            }
+            // When detecting generic XML (or possibly XHTML),
+            // extract the root element and match it against known types
+            if ("application/xml".equals(result.getName())
+                    || "text/html".equals(result.getName())) {
+                XmlRootExtractor extractor = new XmlRootExtractor();
+
+                QName rootElement = extractor.extractRootElement(data);
+                if (rootElement != null) {
+                    for (MimeType type : xmls) {
+                        if (type.matchesXML(
+                                rootElement.getNamespaceURI(),
+                                rootElement.getLocalPart())) {
+                            result = type;
+                            break;
                         }
                     }
+                } else if ("application/xml".equals(result.getName())) {
+                    // Downgrade from application/xml to text/plain since
+                    // the document seems not to be well-formed.
+                    result = textMimeType;
                 }
-                return result;
-            } catch (SAXException e) {
-            } catch (ParserConfigurationException e) {
             }
+            return result;
         }