You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/10/03 23:07:42 UTC
svn commit: r1004050 - in
/tika/trunk/tika-core/src/main/java/org/apache/tika:
detect/XmlRootExtractor.java mime/MimeTypes.java
Author: jukka
Date: Sun Oct 3 21:07:42 2010
New Revision: 1004050
URL: http://svn.apache.org/viewvc?rev=1004050&view=rev
Log:
TIKA-426: Parsing javascript as XML
Make the type detector fall back from application/xml to text/plain if no valid XML root element is found.
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java?rev=1004050&r1=1004049&r2=1004050&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java Sun Oct 3 21:07:42 2010
@@ -37,26 +37,13 @@ import org.xml.sax.helpers.DefaultHandle
*/
public class XmlRootExtractor {
- private final SAXParserFactory factory;
-
- public XmlRootExtractor() throws SAXException, ParserConfigurationException {
- factory = SAXParserFactory.newInstance();
- factory.setNamespaceAware(true);
- factory.setValidating(false);
- try {
- factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
- } catch (SAXNotRecognizedException e) {
- // TIKA-271: Some XML parsers do not support the secure-processing
- // feature, even though it's required by JAXP in Java 5. Ignoring
- // the exception is fine here, deployments without this feature
- // are inherently vulnerable to XML denial-of-service attacks.
- }
-
- }
-
public QName extractRootElement(byte[] data) {
ExtractorHandler handler = new ExtractorHandler();
try {
+ SAXParserFactory factory = SAXParserFactory.newInstance();
+ factory.setNamespaceAware(true);
+ factory.setValidating(false);
+ factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
factory.newSAXParser().parse(
new ByteArrayInputStream(data),
new OfflineContentHandler(handler));
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=1004050&r1=1004049&r2=1004050&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java Sun Oct 3 21:07:42 2010
@@ -144,13 +144,11 @@ public final class MimeTypes implements
/** List of all registered rootXML */
private SortedSet<MimeType> xmls = new TreeSet<MimeType>();
- private transient XmlRootExtractor xmlRootExtractor = null;
-
public MimeTypes() {
rootMimeType = new MimeType(MediaType.OCTET_STREAM);
textMimeType = new MimeType(MediaType.TEXT_PLAIN);
xmlMimeType = new MimeType(MediaType.APPLICATION_XML);
-
+
add(rootMimeType);
add(textMimeType);
add(xmlMimeType);
@@ -229,33 +227,29 @@ public final class MimeTypes implements
}
if (result != null) {
- try {
- XmlRootExtractor extractor = xmlRootExtractor;
- if (extractor == null) {
- extractor = new XmlRootExtractor();
- xmlRootExtractor = extractor;
- }
-
- // When detecting generic XML (or possibly XHTML),
- // extract the root element and match it against known types
- if ("application/xml".equals(result.getName())
- || "text/html".equals(result.getName())) {
- QName rootElement = xmlRootExtractor.extractRootElement(data);
- if (rootElement != null) {
- for (MimeType type : xmls) {
- if (type.matchesXML(
- rootElement.getNamespaceURI(),
- rootElement.getLocalPart())) {
- result = type;
- break;
- }
+ // When detecting generic XML (or possibly XHTML),
+ // extract the root element and match it against known types
+ if ("application/xml".equals(result.getName())
+ || "text/html".equals(result.getName())) {
+ XmlRootExtractor extractor = new XmlRootExtractor();
+
+ QName rootElement = extractor.extractRootElement(data);
+ if (rootElement != null) {
+ for (MimeType type : xmls) {
+ if (type.matchesXML(
+ rootElement.getNamespaceURI(),
+ rootElement.getLocalPart())) {
+ result = type;
+ break;
}
}
+ } else if ("application/xml".equals(result.getName())) {
+ // Downgrade from application/xml to text/plain since
+ // the document seems not to be well-formed.
+ result = textMimeType;
}
- return result;
- } catch (SAXException e) {
- } catch (ParserConfigurationException e) {
}
+ return result;
}