You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by an...@apache.org on 2021/08/20 15:39:24 UTC

[jena] branch main updated: Centralize XML reading setup.

This is an automated email from the ASF dual-hosted git repository.

andy pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/jena.git


The following commit(s) were added to refs/heads/main by this push:
     new 66cae90  Centralize XML reading setup.
     new 73ef978  Merge pull request #1052 from afs/xml-open
66cae90 is described below

commit 66cae9052edef6dd08a172191faad3c50bdbe376
Author: Andy Seaborne <an...@apache.org>
AuthorDate: Mon Aug 9 14:44:56 2021 +0100

    Centralize XML reading setup.
---
 .../java/org/apache/jena/riot/lang/ReaderTriX.java | 30 ++++---
 .../apache/jena/riot/resultset/rw/ResultsStAX.java | 17 ++--
 .../jena/rdfxml/xmlinput/impl/RDFXMLParser.java    | 20 ++---
 .../java/org/apache/jena/util/JenaXMLInput.java    | 94 ++++++++++++++++++++++
 .../apache/jena/rdfxml/xmlinput/DOM2RDFTest.java   | 23 +++---
 .../jena/rdfxml/xmlinput/MoreDOM2RDFTest.java      | 44 +++++-----
 6 files changed, 161 insertions(+), 67 deletions(-)

diff --git a/jena-arq/src/main/java/org/apache/jena/riot/lang/ReaderTriX.java b/jena-arq/src/main/java/org/apache/jena/riot/lang/ReaderTriX.java
index e30edad..bb99365 100644
--- a/jena-arq/src/main/java/org/apache/jena/riot/lang/ReaderTriX.java
+++ b/jena-arq/src/main/java/org/apache/jena/riot/lang/ReaderTriX.java
@@ -22,24 +22,36 @@ import static org.apache.jena.riot.lang.ReaderTriX.State.*;
 
 import java.io.InputStream;
 import java.io.Reader;
-import java.util.*;
+import java.util.ArrayDeque;
+import java.util.Deque;
+import java.util.NoSuchElementException;
+import java.util.Objects;
 
 import javax.xml.namespace.QName;
-import javax.xml.stream.*;
+import javax.xml.stream.Location;
+import javax.xml.stream.XMLStreamConstants;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.XMLStreamReader;
 
 import org.apache.jena.atlas.web.ContentType;
 import org.apache.jena.datatypes.RDFDatatype;
 import org.apache.jena.datatypes.xsd.XSDDatatype;
-import org.apache.jena.graph.*;
-import org.apache.jena.riot.*;
+import org.apache.jena.graph.Node;
+import org.apache.jena.graph.NodeFactory;
+import org.apache.jena.graph.Node_Marker;
+import org.apache.jena.graph.Triple;
+import org.apache.jena.riot.Lang;
+import org.apache.jena.riot.ReaderRIOT;
+import org.apache.jena.riot.ReaderRIOTFactory;
+import org.apache.jena.riot.RiotException;
 import org.apache.jena.riot.system.ErrorHandler;
 import org.apache.jena.riot.system.ParserProfile;
 import org.apache.jena.riot.system.StreamRDF;
 import org.apache.jena.riot.writer.StreamWriterTriX;
 import org.apache.jena.riot.writer.WriterTriX;
 import org.apache.jena.sparql.core.Quad;
-import org.apache.jena.sparql.resultset.ResultSetException;
 import org.apache.jena.sparql.util.Context;
+import org.apache.jena.util.JenaXMLInput;
 import org.apache.jena.vocabulary.RDF;
 
 /** Read TriX.
@@ -82,21 +94,19 @@ public class ReaderTriX implements ReaderRIOT {
 
     @Override
     public void read(InputStream in, String baseURI, ContentType ct, StreamRDF output, Context context) {
-        XMLInputFactory xf = XMLInputFactory.newInstance();
         XMLStreamReader xReader;
         try {
-            xReader = xf.createXMLStreamReader(in);
+            xReader = JenaXMLInput.newXMLStreamReader(in);
         } catch (XMLStreamException e) { throw new RiotException("Can't initialize StAX parsing engine", e); }
         read(xReader,  baseURI, output);
     }
 
     @Override
     public void read(Reader reader, String baseURI, ContentType ct, StreamRDF output, Context context) {
-        XMLInputFactory xf = XMLInputFactory.newInstance();
         XMLStreamReader xReader;
         try {
-            xReader = xf.createXMLStreamReader(reader);
-        } catch (XMLStreamException e) { throw new ResultSetException("Can't initialize StAX parsing engine", e); }
+            xReader = JenaXMLInput.newXMLStreamReader(reader);
+        } catch (XMLStreamException e) { throw new RiotException("Can't initialize StAX parsing engine", e); }
         read(xReader,  baseURI, output);
     }
 
diff --git a/jena-arq/src/main/java/org/apache/jena/riot/resultset/rw/ResultsStAX.java b/jena-arq/src/main/java/org/apache/jena/riot/resultset/rw/ResultsStAX.java
index 295c8cc..7af4cf1 100644
--- a/jena-arq/src/main/java/org/apache/jena/riot/resultset/rw/ResultsStAX.java
+++ b/jena-arq/src/main/java/org/apache/jena/riot/resultset/rw/ResultsStAX.java
@@ -26,7 +26,6 @@ import java.util.NoSuchElementException;
 import java.util.Objects;
 
 import javax.xml.namespace.QName;
-import javax.xml.stream.XMLInputFactory;
 import javax.xml.stream.XMLStreamConstants;
 import javax.xml.stream.XMLStreamException;
 import javax.xml.stream.XMLStreamReader;
@@ -52,31 +51,32 @@ import org.apache.jena.sparql.graph.GraphFactory;
 import org.apache.jena.sparql.resultset.ResultSetException;
 import org.apache.jena.sparql.resultset.SPARQLResult;
 import org.apache.jena.sparql.util.Context;
+import org.apache.jena.util.JenaXMLInput;
 
 /** Public only for use by XMLOutput (legacy) */
 public class ResultsStAX implements ResultSet, Closeable {
     public static SPARQLResult read(InputStream in, Model model, Context context) {
-        XMLInputFactory xf = XMLInputFactory.newInstance() ;
+        XMLStreamReader xReader;
         try {
-            XMLStreamReader xReader = xf.createXMLStreamReader(in) ;
-            return worker(xReader, model, context);
+            xReader = JenaXMLInput.newXMLStreamReader(in);
         } catch (XMLStreamException e) {
             throw new ResultSetException("Can't initialize StAX parsing engine", e) ;
         } catch (Exception ex) {
             throw new ResultSetException("Failed when initializing the StAX parsing engine", ex) ;
         }
+        return worker(xReader, model, context);
     }
 
     public static SPARQLResult read(Reader in, Model model, Context context) {
-        XMLInputFactory xf = XMLInputFactory.newInstance() ;
+        XMLStreamReader xReader;
         try {
-            XMLStreamReader xReader = xf.createXMLStreamReader(in) ;
-            return worker(xReader, model, context) ;
+            xReader = JenaXMLInput.newXMLStreamReader(in);
         } catch (XMLStreamException e) {
             throw new ResultSetException("Can't initialize StAX parsing engine", e) ;
         } catch (Exception ex) {
             throw new ResultSetException("Failed when initializing the StAX parsing engine", ex) ;
         }
+        return worker(xReader, model, context);
     }
 
     private static SPARQLResult worker(XMLStreamReader xReader, Model model, Context context) {
@@ -108,9 +108,6 @@ public class ResultsStAX implements ResultSet, Closeable {
     private boolean         askResult        = false;
 
     private ResultsStAX(XMLStreamReader reader, Model model, Context context) {
-
-
-
         parser = reader ;
         this.model = model ;
         boolean inputGraphBNodeLabels = (context != null) && context.isTrue(ARQ.inputGraphBNodeLabels);
diff --git a/jena-core/src/main/java/org/apache/jena/rdfxml/xmlinput/impl/RDFXMLParser.java b/jena-core/src/main/java/org/apache/jena/rdfxml/xmlinput/impl/RDFXMLParser.java
index 77cd22c..9d11fbb 100644
--- a/jena-core/src/main/java/org/apache/jena/rdfxml/xmlinput/impl/RDFXMLParser.java
+++ b/jena-core/src/main/java/org/apache/jena/rdfxml/xmlinput/impl/RDFXMLParser.java
@@ -23,21 +23,18 @@ import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.UTFDataFormatException;
 
-import javax.xml.parsers.SAXParser;
-import javax.xml.parsers.SAXParserFactory;
-
 import org.apache.jena.rdfxml.xmlinput.FatalParsingErrorException ;
 import org.apache.jena.rdfxml.xmlinput.SAX2RDF ;
 import org.apache.jena.shared.JenaException;
 import org.apache.jena.util.CharEncoding ;
+import org.apache.jena.util.JenaXMLInput;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
 import org.xml.sax.SAXParseException;
 import org.xml.sax.XMLReader;
 
 /**
- * 
- * The main parser, other variants of XMLHandler are for more specialized purposes.
+ * The main RDF/XML parser; other variants of XMLHandler are for more specialized purposes.
  */
 public class RDFXMLParser extends XMLHandler {
 
@@ -72,12 +69,9 @@ public class RDFXMLParser extends XMLHandler {
         return saxParser;
     }
 
-    private static SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
     public static RDFXMLParser create() {
-        try { 
-            SAXParser saxParser = saxParserFactory.newSAXParser();
-            // Get the encapsulated SAX XMLReader
-            XMLReader xmlreader = saxParser.getXMLReader();
+        try {
+            XMLReader xmlreader = JenaXMLInput.createXMLReader();
             RDFXMLParser a = new RDFXMLParser(xmlreader);
             // Default.
             a.setEncoding("UTF");
@@ -99,13 +93,13 @@ public class RDFXMLParser extends XMLHandler {
         initEncodingChecks(input);
         try {
             saxParser.parse(input);
-        } 
+        }
         catch (UTFDataFormatException e) {
                 generalError(ERR_UTF_ENCODING, e);
-        } 
+        }
         catch (IOException e) {
                 generalError(ERR_GENERIC_IO, e);
-        } 
+        }
         catch (WrappedException wrapped) {
             wrapped.throwMe();
         }
diff --git a/jena-core/src/main/java/org/apache/jena/util/JenaXMLInput.java b/jena-core/src/main/java/org/apache/jena/util/JenaXMLInput.java
new file mode 100644
index 0000000..298d693
--- /dev/null
+++ b/jena-core/src/main/java/org/apache/jena/util/JenaXMLInput.java
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.util;
+
+import java.io.InputStream;
+import java.io.Reader;
+
+import javax.xml.XMLConstants;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.XMLStreamReader;
+
+import org.apache.jena.atlas.logging.Log;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+
+/**
+ * Factory methods for XML input: SAX {@link XMLReader}, StAX {@link XMLStreamReader} and DOM {@link DocumentBuilderFactory}.
+ * Processing of external entities (including external DTDs) is disabled and they are silently ignored, to prevent
+ * <a href="https://owasp.org/www-community/vulnerabilities/XML_External_Entity_(XXE)_Processing">XXE Processing</a>
+ * problems. The DOM factory differs: it uses secure processing, which reports an external entity as a parse error.
+ */
+public class JenaXMLInput {
+    // ---- SAX
+    // Used by RDFXMLParser. The factory is created once and shared; each call below creates a new SAXParser.
+    private static SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
+
+    public static XMLReader createXMLReader() throws ParserConfigurationException, SAXException {
+            SAXParser saxParser = saxParserFactory.newSAXParser();
+            XMLReader xmlreader = saxParser.getXMLReader();
+
+            // XXE: there are two choices. Either disable all DTD processing (alternative kept below, not enabled) ...
+//            // EFFECT: RIOT Error if DTD.
+//            xmlreader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
+//            // This may not be strictly required as DTDs shouldn't be allowed at all, per previous line.
+//            xmlreader.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
+
+            // ... or, as done here, turn off external general and parameter entities so they are silently ignored.
+            xmlreader.setFeature("http://xml.org/sax/features/external-general-entities", false);
+            xmlreader.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
+            return xmlreader;
+    }
+
+    // ---- StAX
+    // Used for TriX and results. One shared factory, configured once in the static initializer below.
+    private static XMLInputFactory xf = XMLInputFactory.newInstance() ;
+    static {
+        try {
+    //      // This disables DTDs entirely for that factory
+    //      xf.setProperty(XMLInputFactory.SUPPORT_DTD, false);
+          // Disable external entities (silently ignored). If the property is rejected, this is logged, not thrown.
+          xf.setProperty("javax.xml.stream.isSupportingExternalEntities", false);
+      } catch(IllegalArgumentException ex){
+          Log.error(JenaXMLInput.class, "Problem setting StAX property", ex);
+      }
+    }
+
+    public static XMLStreamReader newXMLStreamReader(InputStream in) throws XMLStreamException {
+        return xf.createXMLStreamReader(in) ;
+    }
+
+    public static XMLStreamReader newXMLStreamReader(Reader in) throws XMLStreamException {
+        return xf.createXMLStreamReader(in) ;
+    }
+
+    // ---- DocumentBuilder
+    // For reference: not used in Jena src/main, but used in src/test by DOM2RDFTest and MoreDOM2RDFTest.
+    public static DocumentBuilderFactory newDocumentBuilderFactory() throws ParserConfigurationException {
+        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+        // Secure processing: causes SAXParseException if there is an external entity.
+        factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
+        return factory;
+    }
+}
diff --git a/jena-core/src/test/java/org/apache/jena/rdfxml/xmlinput/DOM2RDFTest.java b/jena-core/src/test/java/org/apache/jena/rdfxml/xmlinput/DOM2RDFTest.java
index de63fc9..fa00b81 100644
--- a/jena-core/src/test/java/org/apache/jena/rdfxml/xmlinput/DOM2RDFTest.java
+++ b/jena-core/src/test/java/org/apache/jena/rdfxml/xmlinput/DOM2RDFTest.java
@@ -27,6 +27,7 @@ import javax.xml.parsers.ParserConfigurationException;
 
 import org.apache.jena.rdf.model.Model ;
 import org.apache.jena.shared.JenaException ;
+import org.apache.jena.util.JenaXMLInput;
 import org.w3c.dom.Document;
 import org.xml.sax.SAXException;
 
@@ -40,34 +41,32 @@ class DOM2RDFTest extends SAX2RDFTest {
 	public DOM2RDFTest(String dir, String base0, String file) {
 		super(dir, base0, file);
 	}
-	
-	static private DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
-    // DOM must have namespace information inside it!
-	static { factory.setNamespaceAware(true);}
+
 	static private DocumentBuilder domParser;
-	
+
 	static {
 		try {
-		domParser = factory.newDocumentBuilder();
+		    DocumentBuilderFactory factory = JenaXMLInput.newDocumentBuilderFactory();
+		    factory.setNamespaceAware(true);
+		    domParser = factory.newDocumentBuilder();
 		}
 		catch (ParserConfigurationException rte){
 			throw new JenaException(rte);
 		}
 	}
-	
 
 	@Override
     void loadXMLModel(Model m2, InputStream in, RDFEHArray eh2) throws SAXException, IOException {
-		
+
 		Document document = domParser
 				.parse(in,base);
-			
+
 		// Make DOM into transformer input
 //		Source input = new DOMSource(document);
-        DOM2Model d2m = DOM2Model.createD2M(base,m2);	
+        DOM2Model d2m = DOM2Model.createD2M(base,m2);
 
 		d2m.setErrorHandler(eh2);
-		
+
 //		try {
 			try {
 		        d2m.load(document);
@@ -77,7 +76,7 @@ class DOM2RDFTest extends SAX2RDFTest {
 //		} catch (SAXParseException e) {
 //			// already reported, leave it be.
 //		}
-		
+
 
 	}
 
diff --git a/jena-core/src/test/java/org/apache/jena/rdfxml/xmlinput/MoreDOM2RDFTest.java b/jena-core/src/test/java/org/apache/jena/rdfxml/xmlinput/MoreDOM2RDFTest.java
index 93ef1a7..4379a96 100644
--- a/jena-core/src/test/java/org/apache/jena/rdfxml/xmlinput/MoreDOM2RDFTest.java
+++ b/jena-core/src/test/java/org/apache/jena/rdfxml/xmlinput/MoreDOM2RDFTest.java
@@ -27,6 +27,8 @@ import javax.xml.parsers.DocumentBuilderFactory;
 import javax.xml.parsers.ParserConfigurationException;
 
 import junit.framework.TestCase;
+import org.apache.jena.shared.JenaException;
+import org.apache.jena.util.JenaXMLInput;
 import org.w3c.dom.Document;
 import org.xml.sax.SAXException;
 
@@ -37,38 +39,36 @@ public class MoreDOM2RDFTest extends TestCase implements StatementHandler {
 	public MoreDOM2RDFTest(String name) {
 		super(name);
 	}
-	
-	static private DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
-    // DOM must have namespace information inside it!
-	static { factory.setNamespaceAware(true);}
-	static private DocumentBuilder domParser;
-	
-	static {
-		try {
-		domParser = factory.newDocumentBuilder();
-		}
-		catch (ParserConfigurationException rte){
-			throw new RuntimeException(rte);
-		}
-	}
-	
+
+    static private DocumentBuilder domParser;
+
+    static {
+        try {
+            DocumentBuilderFactory factory = JenaXMLInput.newDocumentBuilderFactory();
+            factory.setNamespaceAware(true);
+            domParser = factory.newDocumentBuilder();
+        }
+        catch (ParserConfigurationException rte){
+            throw new JenaException(rte);
+        }
+    }
 
 	public void testDOMwithARP() throws SAXException, IOException {
-		
+
         InputStream in = new FileInputStream("testing/wg/Class/conclusions001.rdf");
 		Document document = domParser
 				.parse(in,"http://www.example.org/");
-			
-		DOM2Model d2m = DOM2Model.createD2M("http://www.example.org/",null);	
+
+		DOM2Model d2m = DOM2Model.createD2M("http://www.example.org/",null);
 
 		d2m.getHandlers().setStatementHandler(this);
-		
+
 			try {
 		        d2m.load(document);
 			} finally {
 				d2m.close();
 			}
-		
+
          assertEquals("Incorrect number of triples",3,count);
 
 	}
@@ -77,14 +77,14 @@ public class MoreDOM2RDFTest extends TestCase implements StatementHandler {
     @Override
     public void statement(AResource subj, AResource pred, AResource obj) {
         count++;
-        
+
     }
 
 
     @Override
     public void statement(AResource subj, AResource pred, ALiteral lit) {
         count++;
-        
+
     }
 
 }