You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/09/15 17:24:40 UTC
[tika] 02/02: TIKA-2467 refactor creation/configuration of XML
parsers/factories/readers to be static methods.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit c0c2eafe46224e5c316f2dede395308930a5ec0d
Author: tballison <ta...@mitre.org>
AuthorDate: Fri Sep 15 13:24:27 2017 -0400
TIKA-2467 refactor creation/configuration of XML parsers/factories/readers
to be static methods.
---
.../tika/batch/builders/BatchProcessBuilder.java | 3 +-
.../batch/builders/CommandLineParserBuilder.java | 3 +-
.../java/org/apache/tika/config/TikaConfig.java | 17 +-
.../apache/tika/config/TikaConfigSerializer.java | 3 +-
.../java/org/apache/tika/parser/ParseContext.java | 105 ++---------
.../external/ExternalParsersConfigReader.java | 3 +-
.../XMLReaderUtils.java} | 191 +++++++--------------
.../java/org/apache/tika/eval/io/XMLLogReader.java | 3 +-
.../apache/tika/eval/reports/ResultsReporter.java | 3 +-
.../tika/parser/image/ImageMetadataExtractor.java | 3 +-
.../tika/parser/image/xmp/JempboxExtractor.java | 3 +-
11 files changed, 95 insertions(+), 242 deletions(-)
diff --git a/tika-batch/src/main/java/org/apache/tika/batch/builders/BatchProcessBuilder.java b/tika-batch/src/main/java/org/apache/tika/batch/builders/BatchProcessBuilder.java
index df5e4bf..314ea76 100644
--- a/tika-batch/src/main/java/org/apache/tika/batch/builders/BatchProcessBuilder.java
+++ b/tika-batch/src/main/java/org/apache/tika/batch/builders/BatchProcessBuilder.java
@@ -36,6 +36,7 @@ import org.apache.tika.io.IOExceptionWithCause;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.util.ClassLoaderUtil;
import org.apache.tika.util.XMLDOMUtil;
+import org.apache.tika.utils.XMLReaderUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
@@ -67,7 +68,7 @@ public class BatchProcessBuilder {
public BatchProcess build(InputStream is, Map<String,String> runtimeAttributes) throws IOException {
Document doc = null;
try {
- DocumentBuilder docBuilder = new ParseContext().getDocumentBuilder();
+ DocumentBuilder docBuilder = XMLReaderUtils.getDocumentBuilder();
doc = docBuilder.parse(is);
} catch (TikaException|SAXException e) {
throw new IOExceptionWithCause(e);
diff --git a/tika-batch/src/main/java/org/apache/tika/batch/builders/CommandLineParserBuilder.java b/tika-batch/src/main/java/org/apache/tika/batch/builders/CommandLineParserBuilder.java
index 66b55bf..e58f163 100644
--- a/tika-batch/src/main/java/org/apache/tika/batch/builders/CommandLineParserBuilder.java
+++ b/tika-batch/src/main/java/org/apache/tika/batch/builders/CommandLineParserBuilder.java
@@ -27,6 +27,7 @@ import org.apache.commons.cli.Options;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.IOExceptionWithCause;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.utils.XMLReaderUtils;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
@@ -43,7 +44,7 @@ public class CommandLineParserBuilder {
public Options build(InputStream is) throws IOException {
Document doc = null;
try {
- DocumentBuilder docBuilder = new ParseContext().getDocumentBuilder();
+ DocumentBuilder docBuilder = XMLReaderUtils.getDocumentBuilder();
doc = docBuilder.parse(is);
} catch (TikaException|SAXException e) {
throw new IOExceptionWithCause(e);
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index 8ca0d6b..28ac235 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@ -65,6 +65,7 @@ import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.utils.AnnotationUtils;
+import org.apache.tika.utils.XMLReaderUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
@@ -127,7 +128,7 @@ public class TikaConfig {
}
public TikaConfig(Path path, ServiceLoader loader)
throws TikaException, IOException, SAXException {
- this(getBuilder().parse(path.toFile()), loader);
+ this(XMLReaderUtils.getDocumentBuilder().parse(path.toFile()), loader);
}
public TikaConfig(File file)
@@ -136,7 +137,7 @@ public class TikaConfig {
}
public TikaConfig(File file, ServiceLoader loader)
throws TikaException, IOException, SAXException {
- this(getBuilder().parse(file), loader);
+ this(XMLReaderUtils.getDocumentBuilder().parse(file), loader);
}
public TikaConfig(URL url)
@@ -145,16 +146,16 @@ public class TikaConfig {
}
public TikaConfig(URL url, ClassLoader loader)
throws TikaException, IOException, SAXException {
- this(getBuilder().parse(url.toString()).getDocumentElement(), loader);
+ this(XMLReaderUtils.getDocumentBuilder().parse(url.toString()).getDocumentElement(), loader);
}
public TikaConfig(URL url, ServiceLoader loader)
throws TikaException, IOException, SAXException {
- this(getBuilder().parse(url.toString()).getDocumentElement(), loader);
+ this(XMLReaderUtils.getDocumentBuilder().parse(url.toString()).getDocumentElement(), loader);
}
public TikaConfig(InputStream stream)
throws TikaException, IOException, SAXException {
- this(getBuilder().parse(stream));
+ this(XMLReaderUtils.getDocumentBuilder().parse(stream));
}
public TikaConfig(Document document) throws TikaException, IOException {
@@ -250,7 +251,7 @@ public class TikaConfig {
} else {
ServiceLoader tmpServiceLoader = new ServiceLoader();
try (InputStream stream = getConfigInputStream(config, tmpServiceLoader)) {
- Element element = getBuilder().parse(stream).getDocumentElement();
+ Element element = XMLReaderUtils.getDocumentBuilder().parse(stream).getDocumentElement();
serviceLoader = serviceLoaderFromDomElement(element, tmpServiceLoader.getLoader());
DetectorXmlLoader detectorLoader = new DetectorXmlLoader();
EncodingDetectorXmlLoader encodingDetectorLoader = new EncodingDetectorXmlLoader();
@@ -390,10 +391,6 @@ public class TikaConfig {
}
}
- private static DocumentBuilder getBuilder() throws TikaException {
- return new ParseContext().getDocumentBuilder();
- }
-
private static Element getChild(Element element, String name) {
Node child = element.getFirstChild();
while (child != null) {
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java
index 3a2249e..c67b03b 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java
@@ -45,6 +45,7 @@ import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
+import org.apache.tika.utils.XMLReaderUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
@@ -74,7 +75,7 @@ public class TikaConfigSerializer {
*/
public static void serialize(TikaConfig config, Mode mode, Writer writer, Charset charset)
throws Exception {
- DocumentBuilder docBuilder = new ParseContext().getDocumentBuilder();
+ DocumentBuilder docBuilder = XMLReaderUtils.getDocumentBuilder();
// root elements
Document doc = docBuilder.newDocument();
diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
index 2521cc9..332efcd 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
@@ -23,19 +23,12 @@ import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.stream.XMLInputFactory;
-import javax.xml.stream.XMLResolver;
-import javax.xml.stream.XMLStreamException;
-import java.io.IOException;
import java.io.Serializable;
-import java.io.StringReader;
-import java.lang.reflect.Method;
import java.util.HashMap;
import java.util.Map;
import org.apache.tika.exception.TikaException;
-import org.xml.sax.EntityResolver;
-import org.xml.sax.InputSource;
-import org.xml.sax.SAXException;
+import org.apache.tika.utils.XMLReaderUtils;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import org.xml.sax.XMLReader;
@@ -54,21 +47,6 @@ public class ParseContext implements Serializable {
/** Map of objects in this context */
private final Map<String, Object> context = new HashMap<String, Object>();
- private static final EntityResolver IGNORING_SAX_ENTITY_RESOLVER = new EntityResolver() {
- public InputSource resolveEntity(String publicId, String systemId) throws SAXException, IOException {
- return new InputSource(new StringReader(""));
- }
- };
-
- private static final XMLResolver IGNORING_STAX_ENTITY_RESOLVER =
- new XMLResolver() {
- @Override
- public Object resolveEntity(String publicID, String systemID, String baseURI, String namespace) throws
- XMLStreamException {
- return "";
- }
- };
-
/**
* Adds the given value to the context as an implementation of the given
* interface.
@@ -129,13 +107,7 @@ public class ParseContext implements Serializable {
if (reader != null) {
return reader;
}
- try {
- reader = getSAXParser().getXMLReader();
- } catch (SAXException e) {
- throw new TikaException("Unable to create an XMLReader", e);
- }
- reader.setEntityResolver(IGNORING_SAX_ENTITY_RESOLVER);
- return reader;
+ return XMLReaderUtils.getXMLReader();
}
/**
@@ -153,13 +125,7 @@ public class ParseContext implements Serializable {
if (parser != null) {
return parser;
} else {
- try {
- return getSAXParserFactory().newSAXParser();
- } catch (ParserConfigurationException e) {
- throw new TikaException("Unable to configure a SAX parser", e);
- } catch (SAXException e) {
- throw new TikaException("Unable to create a SAX parser", e);
- }
+ return XMLReaderUtils.getSAXParser();
}
}
@@ -210,21 +176,16 @@ public class ParseContext implements Serializable {
DocumentBuilderFactory documentBuilderFactory = get(DocumentBuilderFactory.class);
if (documentBuilderFactory != null) {
return documentBuilderFactory;
+ } else {
+ return XMLReaderUtils.getDocumentBuilderFactory();
}
- documentBuilderFactory = DocumentBuilderFactory.newInstance();
- documentBuilderFactory.setNamespaceAware(true);
- documentBuilderFactory.setValidating(false);
- tryToSetSAXFeatureOnDOMFactory(documentBuilderFactory,
- XMLConstants.FEATURE_SECURE_PROCESSING, true);
- tryToSetXercesManager(documentBuilderFactory);
- return documentBuilderFactory;
}
/**
* Returns the DOM builder specified in this parsing context.
* If a builder is not explicitly specified, then a builder
* instance is created and returned. The builder instance is
- * configured to apply an {@link #IGNORING_SAX_ENTITY_RESOLVER},
+ * configured to apply an {@link XMLReaderUtils#IGNORING_SAX_ENTITY_RESOLVER},
* and it sets the ErrorHandler to <code>null</code>.
*
* @since Apache Tika 1.13
@@ -234,15 +195,8 @@ public class ParseContext implements Serializable {
DocumentBuilder documentBuilder = get(DocumentBuilder.class);
if (documentBuilder != null) {
return documentBuilder;
- }
- try {
- DocumentBuilderFactory documentBuilderFactory = getDocumentBuilderFactory();
- documentBuilder = documentBuilderFactory.newDocumentBuilder();
- documentBuilder.setEntityResolver(IGNORING_SAX_ENTITY_RESOLVER);
- documentBuilder.setErrorHandler(null);
- return documentBuilder;
- } catch (ParserConfigurationException e) {
- throw new TikaException("XML parser not available", e);
+ } else {
+ return XMLReaderUtils.getDocumentBuilder();
}
}
@@ -251,7 +205,7 @@ public class ParseContext implements Serializable {
* If a factory is not explicitly specified, then a default factory
* instance is created and returned. The default factory instance is
* configured to be namespace-aware and to apply reasonable security
- * using the {@link #IGNORING_STAX_ENTITY_RESOLVER}.
+ * using the {@link XMLReaderUtils#IGNORING_STAX_ENTITY_RESOLVER}.
*
* @since Apache Tika 1.13
* @return StAX input factory
@@ -261,46 +215,7 @@ public class ParseContext implements Serializable {
if (factory != null) {
return factory;
}
- factory = XMLInputFactory.newFactory();
-
- tryToSetStaxProperty(factory, XMLInputFactory.IS_NAMESPACE_AWARE, true);
- tryToSetStaxProperty(factory, XMLInputFactory.IS_VALIDATING, false);
-
- factory.setXMLResolver(IGNORING_STAX_ENTITY_RESOLVER);
- return factory;
- }
-
- private static void tryToSetSAXFeatureOnDOMFactory(DocumentBuilderFactory dbf, String feature, boolean value) {
- try {
- dbf.setFeature(feature, value);
- } catch (Exception|AbstractMethodError e) {
- }
- }
-
- private static void tryToSetXercesManager(DocumentBuilderFactory dbf) {
- // Try built-in JVM one first, standalone if not
- for (String securityManagerClassName : new String[] {
- "com.sun.org.apache.xerces.internal.util.SecurityManager",
- "org.apache.xerces.util.SecurityManager"
- }) {
- try {
- Object mgr = Class.forName(securityManagerClassName).newInstance();
- Method setLimit = mgr.getClass().getMethod("setEntityExpansionLimit", Integer.TYPE);
- setLimit.invoke(mgr, 4096);
- dbf.setAttribute("http://apache.org/xml/properties/security-manager", mgr);
- // Stop once one can be setup without error
- return;
- } catch (Throwable t) {
- }
- }
- }
-
- private void tryToSetStaxProperty(XMLInputFactory factory, String key, boolean value) {
- try {
- factory.setProperty(key, value);
- } catch (IllegalArgumentException e) {
- //swallow
- }
+ return XMLReaderUtils.getXMLInputFactory();
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java
index 2fd3cb5..1df02e2 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java
@@ -32,6 +32,7 @@ import org.apache.tika.exception.TikaException;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.utils.XMLReaderUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
@@ -54,7 +55,7 @@ public final class ExternalParsersConfigReader implements ExternalParsersConfigR
public static List<ExternalParser> read(InputStream stream) throws TikaException, IOException {
try {
- DocumentBuilder builder = new ParseContext().getDocumentBuilder();
+ DocumentBuilder builder = XMLReaderUtils.getDocumentBuilder();
Document document = builder.parse(new InputSource(stream));
return read(document);
} catch (SAXException e) {
diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
similarity index 58%
copy from tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
copy to tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
index 2521cc9..6f24708 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
@@ -1,20 +1,29 @@
-/*
+/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser;
+
+package org.apache.tika.utils;
+
+import org.apache.tika.exception.TikaException;
+import org.xml.sax.EntityResolver;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXNotRecognizedException;
+import org.xml.sax.SAXNotSupportedException;
+import org.xml.sax.XMLReader;
import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilder;
@@ -26,33 +35,15 @@ import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLResolver;
import javax.xml.stream.XMLStreamException;
import java.io.IOException;
-import java.io.Serializable;
import java.io.StringReader;
import java.lang.reflect.Method;
-import java.util.HashMap;
-import java.util.Map;
-
-import org.apache.tika.exception.TikaException;
-import org.xml.sax.EntityResolver;
-import org.xml.sax.InputSource;
-import org.xml.sax.SAXException;
-import org.xml.sax.SAXNotRecognizedException;
-import org.xml.sax.SAXNotSupportedException;
-import org.xml.sax.XMLReader;
/**
- * Parse context. Used to pass context information to Tika parsers.
- *
- * @since Apache Tika 0.5
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
+ * Utility functions for reading XML. If you are doing SAX parsing, make sure
+ * to use the {@link org.apache.tika.sax.OfflineContentHandler} to guard against
+ * XML External Entity attacks.
*/
-public class ParseContext implements Serializable {
-
- /** Serial version UID. */
- private static final long serialVersionUID = -5921436862145826534L;
-
- /** Map of objects in this context */
- private final Map<String, Object> context = new HashMap<String, Object>();
+public class XMLReaderUtils {
private static final EntityResolver IGNORING_SAX_ENTITY_RESOLVER = new EntityResolver() {
public InputSource resolveEntity(String publicId, String systemId) throws SAXException, IOException {
@@ -70,51 +61,6 @@ public class ParseContext implements Serializable {
};
/**
- * Adds the given value to the context as an implementation of the given
- * interface.
- *
- * @param key the interface implemented by the given value
- * @param value the value to be added, or <code>null</code> to remove
- */
- public <T> void set(Class<T> key, T value) {
- if (value != null) {
- context.put(key.getName(), value);
- } else {
- context.remove(key.getName());
- }
- }
-
- /**
- * Returns the object in this context that implements the given interface.
- *
- * @param key the interface implemented by the requested object
- * @return the object that implements the given interface,
- * or <code>null</code> if not found
- */
- @SuppressWarnings("unchecked")
- public <T> T get(Class<T> key) {
- return (T) context.get(key.getName());
- }
-
- /**
- * Returns the object in this context that implements the given interface,
- * or the given default value if such an object is not found.
- *
- * @param key the interface implemented by the requested object
- * @param defaultValue value to return if the requested object is not found
- * @return the object that implements the given interface,
- * or the given default value if not found
- */
- public <T> T get(Class<T> key, T defaultValue) {
- T value = get(key);
- if (value != null) {
- return value;
- } else {
- return defaultValue;
- }
- }
-
- /**
* Returns the XMLReader specified in this parsing context. If a reader
* is not explicitly specified, then one is created using the specified
* or the default SAX parser.
@@ -124,11 +70,8 @@ public class ParseContext implements Serializable {
* @return XMLReader
* @throws TikaException
*/
- public XMLReader getXMLReader() throws TikaException {
- XMLReader reader = get(XMLReader.class);
- if (reader != null) {
- return reader;
- }
+ public static XMLReader getXMLReader() throws TikaException {
+ XMLReader reader;
try {
reader = getSAXParser().getXMLReader();
} catch (SAXException e) {
@@ -142,24 +85,24 @@ public class ParseContext implements Serializable {
* Returns the SAX parser specified in this parsing context. If a parser
* is not explicitly specified, then one is created using the specified
* or the default SAX parser factory.
+ * <p>
+ * Make sure to wrap your handler in the {@link org.apache.tika.sax.OfflineContentHandler} to
+ * prevent XML External Entity attacks
+ * </p>
+
*
* @see #getSAXParserFactory()
* @since Apache Tika 0.8
* @return SAX parser
* @throws TikaException if a SAX parser could not be created
*/
- public SAXParser getSAXParser() throws TikaException {
- SAXParser parser = get(SAXParser.class);
- if (parser != null) {
- return parser;
- } else {
- try {
- return getSAXParserFactory().newSAXParser();
- } catch (ParserConfigurationException e) {
- throw new TikaException("Unable to configure a SAX parser", e);
- } catch (SAXException e) {
- throw new TikaException("Unable to create a SAX parser", e);
- }
+ public static SAXParser getSAXParser() throws TikaException {
+ try {
+ return getSAXParserFactory().newSAXParser();
+ } catch (ParserConfigurationException e) {
+ throw new TikaException("Unable to configure a SAX parser", e);
+ } catch (SAXException e) {
+ throw new TikaException("Unable to create a SAX parser", e);
}
}
@@ -169,29 +112,31 @@ public class ParseContext implements Serializable {
* instance is created and returned. The default factory instance is
* configured to be namespace-aware, not validating, and to use
* {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}.
+ * <p>
+ * Make sure to wrap your handler in the {@link org.apache.tika.sax.OfflineContentHandler} to
+ * prevent XML External Entity attacks
+ * </p>
*
* @since Apache Tika 0.8
* @return SAX parser factory
*/
- public SAXParserFactory getSAXParserFactory() {
- SAXParserFactory factory = get(SAXParserFactory.class);
- if (factory == null) {
- factory = SAXParserFactory.newInstance();
- factory.setNamespaceAware(true);
- factory.setValidating(false);
- try {
- factory.setFeature(
- XMLConstants.FEATURE_SECURE_PROCESSING, true);
- } catch (ParserConfigurationException e) {
- } catch (SAXNotSupportedException e) {
- } catch (SAXNotRecognizedException e) {
- // TIKA-271: Some XML parsers do not support the
- // secure-processing feature, even though it's required by
- // JAXP in Java 5. Ignoring the exception is fine here, as
- // deployments without this feature are inherently vulnerable
- // to XML denial-of-service attacks.
- }
+ public static SAXParserFactory getSAXParserFactory() {
+ SAXParserFactory factory = SAXParserFactory.newInstance();
+ factory.setNamespaceAware(true);
+ factory.setValidating(false);
+ try {
+ factory.setFeature(
+ XMLConstants.FEATURE_SECURE_PROCESSING, true);
+ } catch (ParserConfigurationException e) {
+ } catch (SAXNotSupportedException e) {
+ } catch (SAXNotRecognizedException e) {
+ // TIKA-271: Some XML parsers do not support the
+ // secure-processing feature, even though it's required by
+ // JAXP in Java 5. Ignoring the exception is fine here, as
+ // deployments without this feature are inherently vulnerable
+ // to XML denial-of-service attacks.
}
+
return factory;
}
@@ -205,17 +150,13 @@ public class ParseContext implements Serializable {
* @since Apache Tika 1.13
* @return DOM parser factory
*/
- private DocumentBuilderFactory getDocumentBuilderFactory() {
+ public static DocumentBuilderFactory getDocumentBuilderFactory() {
//borrowed from Apache POI
- DocumentBuilderFactory documentBuilderFactory = get(DocumentBuilderFactory.class);
- if (documentBuilderFactory != null) {
- return documentBuilderFactory;
- }
- documentBuilderFactory = DocumentBuilderFactory.newInstance();
+ DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
documentBuilderFactory.setNamespaceAware(true);
documentBuilderFactory.setValidating(false);
tryToSetSAXFeatureOnDOMFactory(documentBuilderFactory,
- XMLConstants.FEATURE_SECURE_PROCESSING, true);
+ XMLConstants.FEATURE_SECURE_PROCESSING, true);
tryToSetXercesManager(documentBuilderFactory);
return documentBuilderFactory;
}
@@ -230,14 +171,10 @@ public class ParseContext implements Serializable {
* @since Apache Tika 1.13
* @return DOM Builder
*/
- public DocumentBuilder getDocumentBuilder() throws TikaException {
- DocumentBuilder documentBuilder = get(DocumentBuilder.class);
- if (documentBuilder != null) {
- return documentBuilder;
- }
+ public static DocumentBuilder getDocumentBuilder() throws TikaException {
try {
DocumentBuilderFactory documentBuilderFactory = getDocumentBuilderFactory();
- documentBuilder = documentBuilderFactory.newDocumentBuilder();
+ DocumentBuilder documentBuilder = documentBuilderFactory.newDocumentBuilder();
documentBuilder.setEntityResolver(IGNORING_SAX_ENTITY_RESOLVER);
documentBuilder.setErrorHandler(null);
return documentBuilder;
@@ -256,12 +193,8 @@ public class ParseContext implements Serializable {
* @since Apache Tika 1.13
* @return StAX input factory
*/
- public XMLInputFactory getXMLInputFactory() {
- XMLInputFactory factory = get(XMLInputFactory.class);
- if (factory != null) {
- return factory;
- }
- factory = XMLInputFactory.newFactory();
+ public static XMLInputFactory getXMLInputFactory() {
+ XMLInputFactory factory = XMLInputFactory.newFactory();
tryToSetStaxProperty(factory, XMLInputFactory.IS_NAMESPACE_AWARE, true);
tryToSetStaxProperty(factory, XMLInputFactory.IS_VALIDATING, false);
@@ -273,13 +206,13 @@ public class ParseContext implements Serializable {
private static void tryToSetSAXFeatureOnDOMFactory(DocumentBuilderFactory dbf, String feature, boolean value) {
try {
dbf.setFeature(feature, value);
- } catch (Exception|AbstractMethodError e) {
+ } catch (Exception | AbstractMethodError e) {
}
}
private static void tryToSetXercesManager(DocumentBuilderFactory dbf) {
// Try built-in JVM one first, standalone if not
- for (String securityManagerClassName : new String[] {
+ for (String securityManagerClassName : new String[]{
"com.sun.org.apache.xerces.internal.util.SecurityManager",
"org.apache.xerces.util.SecurityManager"
}) {
@@ -295,7 +228,7 @@ public class ParseContext implements Serializable {
}
}
- private void tryToSetStaxProperty(XMLInputFactory factory, String key, boolean value) {
+ private static void tryToSetStaxProperty(XMLInputFactory factory, String key, boolean value) {
try {
factory.setProperty(key, value);
} catch (IllegalArgumentException e) {
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/XMLLogReader.java b/tika-eval/src/main/java/org/apache/tika/eval/io/XMLLogReader.java
index 753866b..87e4c40 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/io/XMLLogReader.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/io/XMLLogReader.java
@@ -28,6 +28,7 @@ import java.sql.SQLException;
import org.apache.log4j.Level;
import org.apache.tika.io.IOUtils;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.utils.XMLReaderUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -39,7 +40,7 @@ public class XMLLogReader {
public void read(InputStream xmlLogFileIs, XMLLogMsgHandler handler) throws XMLStreamException {
InputStream is = new LogXMLWrappingInputStream(xmlLogFileIs);
- XMLInputFactory factory = new ParseContext().getXMLInputFactory();
+ XMLInputFactory factory = XMLReaderUtils.getXMLInputFactory();
XMLStreamReader reader = factory.createXMLStreamReader(is);
Level level = null;
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/reports/ResultsReporter.java b/tika-eval/src/main/java/org/apache/tika/eval/reports/ResultsReporter.java
index a9e541a..db2a171 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/reports/ResultsReporter.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/reports/ResultsReporter.java
@@ -45,6 +45,7 @@ import org.apache.tika.eval.ExtractProfiler;
import org.apache.tika.eval.db.H2Util;
import org.apache.tika.eval.db.JDBCUtil;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.utils.XMLReaderUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
@@ -104,7 +105,7 @@ public class ResultsReporter {
ResultsReporter r = new ResultsReporter();
- DocumentBuilder docBuilder = new ParseContext().getDocumentBuilder();
+ DocumentBuilder docBuilder = XMLReaderUtils.getDocumentBuilder();
Document doc;
try (InputStream is = Files.newInputStream(p)) {
doc = docBuilder.parse(is);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
index 64ddf73..d2ee0c6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
@@ -59,6 +59,7 @@ import org.apache.tika.metadata.TIFF;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.image.xmp.JempboxExtractor;
+import org.apache.tika.utils.XMLReaderUtils;
import org.w3c.dom.Document;
import org.xml.sax.SAXException;
@@ -183,7 +184,7 @@ public class ImageMetadataExtractor {
try (InputStream decoded =
new ByteArrayInputStream(xmpData)
) {
- Document dom = new ParseContext().getDocumentBuilder().parse(decoded);
+ Document dom = XMLReaderUtils.getDocumentBuilder().parse(decoded);
if (dom != null) {
xmp = new XMPMetadata(dom);
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
index 6d5038a..e847615 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
@@ -37,6 +37,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.XMPMM;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.utils.DateUtils;
+import org.apache.tika.utils.XMLReaderUtils;
import org.w3c.dom.Document;
import org.xml.sax.SAXException;
@@ -66,7 +67,7 @@ public class JempboxExtractor {
try (InputStream decoded =
new ByteArrayInputStream(xmpraw.toByteArray())
) {
- Document dom = new ParseContext().getDocumentBuilder().parse(decoded);
+ Document dom = XMLReaderUtils.getDocumentBuilder().parse(decoded);
if (dom != null) {
xmp = new XMPMetadata(dom);
}
--
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.