You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@daffodil.apache.org by GitBox <gi...@apache.org> on 2020/09/03 14:04:09 UTC

[GitHub] [incubator-daffodil] stevedlawrence commented on a change in pull request #408: Implement SAX Parsing

stevedlawrence commented on a change in pull request #408:
URL: https://github.com/apache/incubator-daffodil/pull/408#discussion_r482931525



##########
File path: daffodil-cli/src/main/scala/org/apache/daffodil/Main.scala
##########
@@ -840,17 +854,29 @@ object Main extends Logging {
               case Some("-") | None => System.out
               case Some(file) => new FileOutputStream(file)
             }
-            val outputter = getInfosetOutputter(parseOpts.infosetType.toOption.get, output)
+            val infosetType = parseOpts.infosetType.toOption.get
+            val eitherOutputterOrHandler = getInfosetOutputter(infosetType, output)
 
             var lastParseBitPosition = 0L
             var keepParsing = true
             var error = false
 
             while (keepParsing) {
 
-              outputter.reset() // reset in case we are streaming
-
-              val parseResult = Timer.getResult("parsing", processor.parse(inStream, outputter))
+              val parseResult = eitherOutputterOrHandler match {
+                case Right(saxContentHandler) =>
+                  saxContentHandler.reset() // reset in case we are streaming
+                  val saxXmlRdr = processor.newXMLReaderInstance
+                  saxXmlRdr.setContentHandler(saxContentHandler)
+                  saxXmlRdr.setErrorHandler(new DaffodilSAXErrorHandler)
+                  saxXmlRdr.setProperty("BlobDirectory", blobDir)
+                  saxXmlRdr.setProperty("BlobSuffix", blobSuffix)
+                  Timer.getResult("parsing", saxXmlRdr.parse(input))
+                  saxXmlRdr.getProperty("org.apache.daffodil.processors.ParseResult").asInstanceOf[ParseResult]

Review comment:
       According to the SAX API, property names are supposed to be URIs. Presumably this avoids conflicts where different readers have different uses for the same property names. The Daffodil URN is "urn:ogf:dfdl:2013:imp:daffodil.apache.org:2018", so it probably makes sense to append some stuff to that for these SAX properties. Maybe something like this:
   ```
   urn:ogf:dfdl:2013:imp:daffodil.apache.org:2018:sax:blobDirectory
   urn:ogf:dfdl:2013:imp:daffodil.apache.org:2018:sax:blobSuffix
   urn:ogf:dfdl:2013:imp:daffodil.apache.org:2018:sax:parseResult
   ```

##########
File path: daffodil-cli/src/main/scala/org/apache/daffodil/Main.scala
##########
@@ -1037,8 +1066,11 @@ object Main extends Logging {
                       })
                       case Right(data) => Timer.getTimeResult({
                         val input = InputSourceDataInputStream(data)
-                        val outputterForParse = getInfosetOutputter(infosetType, nullOutputStreamForParse)
-                        processor.parse(input, outputterForParse)
+                        val eitherOutputterOrHandlerForParse = getInfosetOutputter(infosetType, nullOutputStreamForParse)
+                        eitherOutputterOrHandlerForParse match {
+                          case Left(outputter) => processor.parse(input, outputter)
+                          case Right(saxContentHandler) => Assert.nyi("SAX Parse is not yet implemented")

Review comment:
       This is important to implement since this is likely the best way to determine if there are any differences between SAX and non-SAX parsing.

##########
File path: daffodil-cli/src/main/scala/org/apache/daffodil/Main.scala
##########
@@ -417,6 +423,7 @@ class CLIConf(arguments: Array[String]) extends scallop.ScallopConf(arguments)
       case (Some("json"), _) => Right(Unit)
       case (Some("jdom"), _) => Right(Unit)
       case (Some("w3cdom"), _) => Right(Unit)
+      case (Some("sax"), _) => Right(Unit)

Review comment:
       This should error if the unparse flag is set. We can remove that error once SAX unparsing is implemented.

##########
File path: daffodil-lib/src/main/scala/org/apache/daffodil/xml/XMLUtils.scala
##########
@@ -798,10 +798,12 @@ object XMLUtils {
 
   class XMLDifferenceException(message: String) extends Exception(message)
 
-  def compareAndReport(expected: Node, actual: Node, ignoreProcInstr: Boolean = true) = {
+  def compareAndReport(expected: Node, actual: Node, ignoreProcInstr: Boolean = true,
+    checkPrefixes: Boolean = false, checkNamespaces: Boolean = false) = {

Review comment:
       I'm not sure how strict we are about this, but the Scala style guide says that if you need to wrap a long list of parameters, then each should be on its own line. Same goes for when calling a function with a bunch of parameters like below.

##########
File path: daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/DataProcessor.scala
##########
@@ -718,3 +748,196 @@ class UnparseResult(dp: DataProcessor, ustate: UState)
     encodingInfo.knownEncodingName
   }
 }
+
+class DaffodilXMLReader(dp: DataProcessor) extends XMLReader with DFDL.DaffodilXMLReader {

Review comment:
       Don't need to extend XMLReader here if DaffodilXMLReader already does. That would make things a little cleaner.

##########
File path: daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/DataProcessor.scala
##########
@@ -718,3 +748,196 @@ class UnparseResult(dp: DataProcessor, ustate: UState)
     encodingInfo.knownEncodingName
   }
 }
+
+class DaffodilXMLReader(dp: DataProcessor) extends XMLReader with DFDL.DaffodilXMLReader {
+  private var _contentHandler: ContentHandler = _
+  private var _errorHandler: ErrorHandler = _
+  private var _dtdHandler: DTDHandler = _
+  private var _entityResolver: EntityResolver = _
+  val SAXNamespaceFeature = "http://xml.org/sax/features/namespaces"
+  val SAXNamespacePrefixFeature = "http://xml.org/sax/features/namespace-prefixes"
+  val SAXParseResultProperty = "org.apache.daffodil.processors.ParseResult"
+
+  private var _propertyMap = mutable.Map[String, AnyRef]()
+  private val _featureMap = mutable.Map[String, Boolean](SAXNamespaceFeature -> false,
+    SAXNamespacePrefixFeature -> false)
+
+  override def getFeature(name: String): Boolean = {
+    name match {
+      case SAXNamespaceFeature | SAXNamespacePrefixFeature => _featureMap(name)
+      case _ => {
+        throw new SAXNotRecognizedException("Only namespace and namespace features are supported for SAX parsing")
+        false
+      }
+    }
+  }
+
+  override def setFeature(name: String, value: Boolean): Unit = {
+    name match {
+      case SAXNamespaceFeature | SAXNamespacePrefixFeature =>
+        _featureMap(name) = value
+      case _ => {
+        throw new SAXNotRecognizedException("Only namespace and namespace features are supported for SAX parsing")
+      }
+    }
+  }
+
+  override def getProperty(name: String): AnyRef = {
+    _propertyMap.getOrElse(name,
+      throw new SAXNotSupportedException(s"SAX Property '${name}' cannot be retrieved"))
+  }
+
+  override def setProperty(name: String, value: AnyRef): Unit = {
+    _propertyMap += (name -> value)
+  }

Review comment:
       The SAX API says:
   > The property name is any fully-qualified URI. It is possible for an XMLReader to recognize a property name but to be unable to change the current value. Some property values may be immutable or mutable only in specific contexts, such as before, during, or after a parse.
   >
   > XMLReaders are not required to recognize setting any specific property names, though a core set is defined by SAX2.
   
   So I don't think we should allow setting random properties. We also aren't doing any validation on those property values (e.g. that blobDir is actually a string). I would recommend that we have a unique variable for each of the properties (e.g. ``val blobDir: Path``) that can be set rather than a Map, and we validate here before setting those variables. Then when we actually need the properties, it's an easy and type-safe variable access instead of a map lookup to an AnyRef.

##########
File path: daffodil-cli/src/main/scala/org/apache/daffodil/Main.scala
##########
@@ -450,7 +457,7 @@ class CLIConf(arguments: Array[String]) extends scallop.ScallopConf(arguments)
     val vars = props[String]('D', keyName = "variable", valueName = "value", descr = "variables to be used when unparsing. An optional namespace may be provided.")
     val tunables = props[String]('T', keyName = "tunable", valueName = "value", descr = "daffodil tunable to be used when parsing.")
     val config = opt[String](short = 'c', argName = "file", descr = "path to file containing configuration items.")
-    val infosetType = opt[String](short = 'I', argName = "infoset_type", descr = "infoset type to unparse. Must be one of 'xml', 'scala-xml', 'json', 'jdom', or 'w3cdom'.", default = Some("xml")).map { _.toLowerCase }
+    val infosetType = opt[String](short = 'I', argName = "infoset_type", descr = "infoset type to unparse. Must be one of 'xml', 'scala-xml', 'json', 'jdom', 'w3cdom', or 'sax'.", default = Some("xml")).map { _.toLowerCase }

Review comment:
       This doesn't support unparsing yet.

##########
File path: daffodil-lib/src/main/scala/org/apache/daffodil/xml/XMLUtils.scala
##########
@@ -841,21 +844,31 @@ Differences were (path, expected, actual):
     maybeIndex: Option[Int],
     parentPathSteps: Seq[String],
     ignoreProcInstr: Boolean,
+    checkPrefixes: Boolean,
+    checkNamespaces: Boolean,
     maybeType: Option[String]): Seq[(String, String, String)] = {
     lazy val zPath = parentPathSteps.reverse.mkString("/")
     (an, bn) match {
       case (a: Elem, b: Elem) => {
-        val Elem(_, labelA, attribsA, _, childrenA @ _*) = a
-        val Elem(_, labelB, attribsB, _, childrenB @ _*) = b
+        val Elem(prefixA, labelA, attribsA, nsbA, childrenA @ _*) = a
+        val Elem(prefixB, labelB, attribsB, nsbB, childrenB @ _*) = b
         val typeA: Option[String] = a.attribute(XSI_NAMESPACE.toString, "type").map(_.head.text)
         val typeB: Option[String] = b.attribute(XSI_NAMESPACE.toString, "type").map(_.head.text)
         val maybeType: Option[String] = Option(typeA.getOrElse(typeB.getOrElse(null)))
         val nilledA = a.attribute(XSI_NAMESPACE.toString, "nil")
         val nilledB = b.attribute(XSI_NAMESPACE.toString, "nil")
+        val nsbACompare = nsbA.toString().trim.split(" ").sorted
+        val nsbBCompare = nsbB.toString().trim.split(" ").sorted
 
         if (labelA != labelB) {
           // different label
           List((zPath, labelA, labelB))
+        } else if (checkPrefixes && prefixA != prefixB) {
+          // different prefix
+          List((zPath, prefixA, prefixB))
+        } else if (checkNamespaces && !(nsbACompare sameElements nsbBCompare)) {

Review comment:
       We generally avoid this space-separated (infix) method call syntax in Scala. We prefer something like ``nsbACompare.sameElements(nsbBCompare)``.

##########
File path: daffodil-lib/src/main/scala/org/apache/daffodil/xml/XMLUtils.scala
##########
@@ -841,21 +844,31 @@ Differences were (path, expected, actual):
     maybeIndex: Option[Int],
     parentPathSteps: Seq[String],
     ignoreProcInstr: Boolean,
+    checkPrefixes: Boolean,
+    checkNamespaces: Boolean,
     maybeType: Option[String]): Seq[(String, String, String)] = {
     lazy val zPath = parentPathSteps.reverse.mkString("/")
     (an, bn) match {
       case (a: Elem, b: Elem) => {
-        val Elem(_, labelA, attribsA, _, childrenA @ _*) = a
-        val Elem(_, labelB, attribsB, _, childrenB @ _*) = b
+        val Elem(prefixA, labelA, attribsA, nsbA, childrenA @ _*) = a
+        val Elem(prefixB, labelB, attribsB, nsbB, childrenB @ _*) = b
         val typeA: Option[String] = a.attribute(XSI_NAMESPACE.toString, "type").map(_.head.text)
         val typeB: Option[String] = b.attribute(XSI_NAMESPACE.toString, "type").map(_.head.text)
         val maybeType: Option[String] = Option(typeA.getOrElse(typeB.getOrElse(null)))
         val nilledA = a.attribute(XSI_NAMESPACE.toString, "nil")
         val nilledB = b.attribute(XSI_NAMESPACE.toString, "nil")
+        val nsbACompare = nsbA.toString().trim.split(" ").sorted
+        val nsbBCompare = nsbB.toString().trim.split(" ").sorted

Review comment:
       Worth adding a comment why we are sorting here.

##########
File path: daffodil-lib/src/main/scala/org/apache/daffodil/util/Validator.scala
##########
@@ -42,11 +44,17 @@ object Validator extends NoBindingFactoryAdapter {
 
   def validateXMLSources(schemaFileNames: Seq[String], document: java.io.InputStream, errHandler: ErrorHandler): Unit = {
     val cache = validationSchemaCache.get()
-    val validator = {
+    val validator : javax.xml.validation.Validator = {
       val optCachedValidator = cache.get(schemaFileNames)
       optCachedValidator match {
-        case Some(validator) => {
-          validator.reset()
+        case Some(cachedValidator) => {
+          cachedValidator.reset()
+          // reset takes it back to the original state at the point we call
+          // newValidator. So we need to re-set the features, resolvers,
+          // and handlers
+          val resolver = DFDLCatalogResolver.get
+          val validator : javax.xml.validation.Validator =
+            initializeValidator(cachedValidator, errHandler, resolver)

Review comment:
       Interesting. So does that mean we had a bug where if full validation was enabled and we called ``parse()`` twice that the second time would fail?

##########
File path: daffodil-lib/src/main/scala/org/apache/daffodil/xml/XMLUtils.scala
##########
@@ -1009,7 +1022,9 @@ Differences were (path, expected, actual):
     dataB: String,
     maybeType: Option[String]): Seq[(String, String, String)] = {
 
-    if (maybeType.isDefined && maybeType.get == "xs:anyURI") computeBlobDiff(zPath, dataA, dataB)
+    if( (maybeType.isDefined && maybeType.get == "xs:anyURI")
+      || Seq(dataA, dataB).forall(_.startsWith("file://")) )

Review comment:
       Worth adding a comment why we can't always rely on xsi:type and sometimes use the "file://" heuristic. Might also be worth pulling this condition into one or two vals so that it's easier to read. Something like
   
   ```scala
   val hasBlobType = maybeType.isDefined ...
   val looksLikeBlobURI = ...startsWith("file://")
   if (hasBlobType || looksLikeBlobURI) ...
   ```
   This way the code is more self documenting.

##########
File path: daffodil-runtime1/src/main/scala/org/apache/daffodil/api/DFDLParserUnparser.scala
##########
@@ -192,6 +196,19 @@ object DFDL {
     def parse(input: InputSourceDataInputStream, output: InfosetOutputter): ParseResult
   }
 
+  trait DaffodilXMLReader {
+    def setContentHandler(ch: org.xml.sax.ContentHandler): Unit
+    def getContentHandler: org.xml.sax.ContentHandler
+    def setErrorHandler(er: org.xml.sax.ErrorHandler): Unit
+    def getErrorHandler: org.xml.sax.ErrorHandler
+    def setProperty(p: String, v: AnyRef): Unit
+    def getProperty(p: String): AnyRef
+    def parse(is: java.io.InputStream): Unit
+    def parse(in: InputSourceDataInputStream): Unit
+    def parse(in: org.xml.sax.InputSource): Unit
+    def parse(ab: Array[Byte]): Unit
+  }
+

Review comment:
       Looks like this is just an XMLReader but with extra parse functions? If so, it might be more clear to do something like this
   ```scala
   trait DaffodilXMLReader extends XMLReader {
     def parse(in: InputSourceDataInputStream): Unit
     def parse(in: org.xml.sax.InputSource): Unit
     def parse(ab: Array[Byte]): Unit
   }
   ```

##########
File path: daffodil-runtime1/src/main/scala/org/apache/daffodil/infoset/SAXInfosetOutputter.scala
##########
@@ -0,0 +1,266 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.daffodil.infoset
+
+import org.apache.daffodil.api.DFDL
+import org.apache.daffodil.dpath.NodeInfo
+import org.apache.daffodil.xml.XMLUtils
+import org.xml.sax.SAXException
+import org.xml.sax.helpers.AttributesImpl
+
+class SAXInfosetOutputter(xmlReader: DFDL.DaffodilXMLReader)
+  extends InfosetOutputter
+  with XMLInfosetOutputter {
+  /**
+   * Reset the internal state of this InfosetOutputter. This should be called
+   * inbetween calls to the parse method.
+   */
+  override def reset(): Unit = {
+    // this doesn't do anything as the ContentHandler API does not support
+    // resetting, but some implemented ContentHandlers, such as the JDOM SaxHandler,
+    // do support resetting so it's up to the creator of the contentHandler, to call
+    // their contentHandler's reset if applicable and if necessary
+  }
+
+  /**
+   * Called by Daffodil internals to signify the beginning of the infoset.
+   *
+   * @return true on sucess, false if there was an error and Daffodil should stop all
+   *         future calls to the InfosetOutputter
+   */
+  override def startDocument(): Boolean = {
+    val contentHandler = xmlReader.getContentHandler
+    try {
+      contentHandler.startDocument()
+      true
+    } catch {
+      case _: SAXException => false
+    }
+  }
+
+  /**
+   * Called by Daffodil internals to signify the end of the infoset.
+   *
+   * @return true on sucess, false if there was an error and Daffodil should stop all
+   *         future calls to the InfosetOutputter
+   */
+  override def endDocument(): Boolean = {
+    val contentHandler = xmlReader.getContentHandler
+    try {
+      contentHandler.endDocument()
+      true
+    } catch {
+      case _: SAXException => false
+    }
+  }
+
+  /**
+   * Called by Daffodil internals to signify the beginning of a simple element.
+   *
+   * @param diSimple the simple element that is started. Various fields of
+   *                 DISimple can be accessed to determine things like the
+   *                 value, nil, name, namespace, etc.
+   * @return true on sucess, false if there was an error and Daffodil should stop all
+   *         future calls to the InfosetOutputter
+   */
+  override def startSimple(diSimple: DISimple): Boolean = {
+    val contentHandler = xmlReader.getContentHandler
+    val ns = if (diSimple.erd.thisElementsNamespace.isNoNamespace) "" else diSimple.erd.thisElementsNamespace.toString
+    val prefix = if(diSimple.erd.thisElementsNamespacePrefix == null) "" else  diSimple.erd.thisElementsNamespacePrefix
+    val elemName = diSimple.erd.name
+    val qName = if (prefix == "") elemName else s"$prefix:$elemName"

Review comment:
       I'm wondering if we even need thisElementsNamespace/Prefix? Shouldn't all that information exist in the namedQName variable? Perhaps we can refactor those variables out and make RuntimeData a little smaller? For example, I think this is equivalent:
   ```scala
   val ns = if (diSimple.erd.namedQName.namespace.isNoNamespace) ...
   val prefix = diSimple.erd.namedQName.prefix.getOrElse("")
   val elemName = diSimple.erd.namedQName.local
   val qName = diSimple.erd.namedQName.toQNameString
   ```

##########
File path: daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/DataProcessor.scala
##########
@@ -718,3 +748,196 @@ class UnparseResult(dp: DataProcessor, ustate: UState)
     encodingInfo.knownEncodingName
   }
 }
+
+class DaffodilXMLReader(dp: DataProcessor) extends XMLReader with DFDL.DaffodilXMLReader {
+  private var _contentHandler: ContentHandler = _
+  private var _errorHandler: ErrorHandler = _
+  private var _dtdHandler: DTDHandler = _
+  private var _entityResolver: EntityResolver = _
+  val SAXNamespaceFeature = "http://xml.org/sax/features/namespaces"
+  val SAXNamespacePrefixFeature = "http://xml.org/sax/features/namespace-prefixes"
+  val SAXParseResultProperty = "org.apache.daffodil.processors.ParseResult"
+

Review comment:
       Can these be private?

##########
File path: daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/DataProcessor.scala
##########
@@ -718,3 +748,196 @@ class UnparseResult(dp: DataProcessor, ustate: UState)
     encodingInfo.knownEncodingName
   }
 }
+
+class DaffodilXMLReader(dp: DataProcessor) extends XMLReader with DFDL.DaffodilXMLReader {
+  private var _contentHandler: ContentHandler = _
+  private var _errorHandler: ErrorHandler = _
+  private var _dtdHandler: DTDHandler = _
+  private var _entityResolver: EntityResolver = _
+  val SAXNamespaceFeature = "http://xml.org/sax/features/namespaces"
+  val SAXNamespacePrefixFeature = "http://xml.org/sax/features/namespace-prefixes"
+  val SAXParseResultProperty = "org.apache.daffodil.processors.ParseResult"
+
+  private var _propertyMap = mutable.Map[String, AnyRef]()
+  private val _featureMap = mutable.Map[String, Boolean](SAXNamespaceFeature -> false,
+    SAXNamespacePrefixFeature -> false)
+
+  override def getFeature(name: String): Boolean = {
+    name match {
+      case SAXNamespaceFeature | SAXNamespacePrefixFeature => _featureMap(name)
+      case _ => {
+        throw new SAXNotRecognizedException("Only namespace and namespace features are supported for SAX parsing")
+        false
+      }
+    }
+  }
+
+  override def setFeature(name: String, value: Boolean): Unit = {
+    name match {
+      case SAXNamespaceFeature | SAXNamespacePrefixFeature =>
+        _featureMap(name) = value
+      case _ => {
+        throw new SAXNotRecognizedException("Only namespace and namespace features are supported for SAX parsing")
+      }
+    }
+  }
+
+  override def getProperty(name: String): AnyRef = {
+    _propertyMap.getOrElse(name,
+      throw new SAXNotSupportedException(s"SAX Property '${name}' cannot be retrieved"))
+  }
+
+  override def setProperty(name: String, value: AnyRef): Unit = {
+    _propertyMap += (name -> value)
+  }
+
+  override def setEntityResolver(resolver: EntityResolver): Unit = {
+    _entityResolver = resolver
+  }
+
+  override def getEntityResolver: EntityResolver = _entityResolver
+
+  override def setDTDHandler(handler: DTDHandler): Unit = {
+    _dtdHandler = handler
+  }
+
+  override def getDTDHandler: DTDHandler = _dtdHandler
+
+  override def setContentHandler(handler: ContentHandler): Unit = {
+    _contentHandler = handler;
+  }
+
+  override def getContentHandler: ContentHandler = _contentHandler
+
+  override def setErrorHandler(handler: ErrorHandler): Unit = {
+    _errorHandler = handler;
+  }
+
+  override def getErrorHandler: ErrorHandler = _errorHandler
+
+  override def parse(input: InputSource): Unit = {
+    val is = input.getByteStream
+    if(is != null) {
+      val isdis = InputSourceDataInputStream(is)
+      val sio = createSAXInfosetOutputter(this)
+      val pr = dp.parse(isdis, sio)
+      handleDiagnostics(pr)
+      setProperty(SAXParseResultProperty, pr)
+    } else {
+      throw new IOException("Inputsource must be backed by Inputstream")
+    }
+  }
+
+  override def parse(systemId: String): Unit = {
+    throw new IOException("SAX parsing of systemId is unsupported")
+  }
+
+  def parse(isdis: InputSourceDataInputStream): Unit = {
+    val sio = createSAXInfosetOutputter(this)
+    val pr = dp.parse(isdis, sio)
+    handleDiagnostics(pr)
+    setProperty(SAXParseResultProperty, pr)
+  }
+
+  def parse(stream: InputStream): Unit = {
+    val isdis = InputSourceDataInputStream(stream)
+    val sio = createSAXInfosetOutputter(this)
+    val pr = dp.parse(isdis, sio)
+    handleDiagnostics(pr)
+    setProperty(SAXParseResultProperty, pr)
+  }
+
+  def parse(arr: Array[Byte]): Unit = {
+    val isdis = InputSourceDataInputStream(arr)
+    val sio = createSAXInfosetOutputter(this)
+    val pr = dp.parse(isdis, sio)
+    handleDiagnostics(pr)
+    setProperty(SAXParseResultProperty, pr)
+  }
+
+  private def handleDiagnostics(pr: DFDL.ParseResult): Unit = {
+    val diagnostics = pr.getDiagnostics
+    if (diagnostics.nonEmpty) {
+      val eh = this.getErrorHandler

Review comment:
       Should check if eh is null. If there is no error handler, then we shouldn't bother trying to create diagnostics.

##########
File path: daffodil-runtime1/src/main/scala/org/apache/daffodil/infoset/SAXInfosetOutputter.scala
##########
@@ -0,0 +1,266 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.daffodil.infoset
+
+import org.apache.daffodil.api.DFDL
+import org.apache.daffodil.dpath.NodeInfo
+import org.apache.daffodil.xml.XMLUtils
+import org.xml.sax.SAXException
+import org.xml.sax.helpers.AttributesImpl
+
+class SAXInfosetOutputter(xmlReader: DFDL.DaffodilXMLReader)
+  extends InfosetOutputter
+  with XMLInfosetOutputter {
+  /**
+   * Reset the internal state of this InfosetOutputter. This should be called
+   * inbetween calls to the parse method.
+   */
+  override def reset(): Unit = {
+    // this doesn't do anything as the ContentHandler API does not support
+    // resetting, but some implemented ContentHandlers, such as the JDOM SaxHandler,
+    // do support resetting so it's up to the creator of the contentHandler, to call
+    // their contentHandler's reset if applicable and if necessary
+  }
+
+  /**
+   * Called by Daffodil internals to signify the beginning of the infoset.
+   *
+   * @return true on sucess, false if there was an error and Daffodil should stop all
+   *         future calls to the InfosetOutputter
+   */
+  override def startDocument(): Boolean = {
+    val contentHandler = xmlReader.getContentHandler
+    try {
+      contentHandler.startDocument()
+      true
+    } catch {
+      case _: SAXException => false
+    }
+  }
+
+  /**
+   * Called by Daffodil internals to signify the end of the infoset.
+   *
+   * @return true on sucess, false if there was an error and Daffodil should stop all
+   *         future calls to the InfosetOutputter
+   */
+  override def endDocument(): Boolean = {
+    val contentHandler = xmlReader.getContentHandler
+    try {
+      contentHandler.endDocument()
+      true
+    } catch {
+      case _: SAXException => false
+    }
+  }
+
+  /**
+   * Called by Daffodil internals to signify the beginning of a simple element.
+   *
+   * @param diSimple the simple element that is started. Various fields of
+   *                 DISimple can be accessed to determine things like the
+   *                 value, nil, name, namespace, etc.
+   * @return true on sucess, false if there was an error and Daffodil should stop all
+   *         future calls to the InfosetOutputter
+   */
+  override def startSimple(diSimple: DISimple): Boolean = {
+    val contentHandler = xmlReader.getContentHandler
+    val ns = if (diSimple.erd.thisElementsNamespace.isNoNamespace) "" else diSimple.erd.thisElementsNamespace.toString
+    val prefix = if(diSimple.erd.thisElementsNamespacePrefix == null) "" else  diSimple.erd.thisElementsNamespacePrefix
+    val elemName = diSimple.erd.name
+    val qName = if (prefix == "") elemName else s"$prefix:$elemName"
+
+    try {
+      val nsbStart = diSimple.erd.minimizedScope
+      val nsbEnd = if (diSimple.isRoot) scala.xml.TopScope else diSimple.diParent.erd.minimizedScope
+      var n = nsbStart
+      while( n != nsbEnd) {
+        val prefix = if(n.prefix == null) "" else n.prefix
+        val uri = if(n.uri == null) "" else n.uri
+        contentHandler.startPrefixMapping(prefix, uri)
+        n = n.parent
+      }
+
+      val attrs = if(isNilled(diSimple)) {
+        createNilAttribute()
+      } else {
+        new AttributesImpl()
+      }
+
+      contentHandler.startElement(ns, elemName, qName, attrs)
+
+      if (diSimple.hasValue)  {
+        val text =
+          if (diSimple.erd.optPrimType.get.isInstanceOf[NodeInfo.String.Kind]) {
+            remapped(diSimple.dataValueAsString)
+          } else {
+            diSimple.dataValueAsString
+          }
+        val arr = text.toCharArray
+        contentHandler.characters(arr,0, arr.length)
+      }
+      true
+    } catch {
+      case _: SAXException  => false
+    }
+  }
+
+  /**
+   * Called by Daffodil internals to signify the end of a simple element.
+   *
+   * @param diSimple the simple element that is ended. Various fields of
+   *                 DISimple can be accessed to determine things like the
+   *                 value, nil, name, namespace, etc.
+   * @return true on sucess, false if there was an error and Daffodil should stop all
+   *         future calls to the InfosetOutputter
+   */
+  override def endSimple(diSimple: DISimple): Boolean = {
+    val contentHandler = xmlReader.getContentHandler
+    val ns = diSimple.erd.thisElementsNamespace
+    val nsb = diSimple.namedQName
+    try {
+      contentHandler.endElement(ns, nsb.local, nsb.toQNameString)
+
+      val nsbStart = diSimple.erd.minimizedScope
+      val nsbEnd = if (diSimple.isRoot) scala.xml.TopScope else diSimple.diParent.erd.minimizedScope
+      var n = nsbStart
+
+      // we store the the prefixes in the same order as we started them so we can
+      // reverse them and end them in the enclosing order
+      var prefixes: Seq[String] = Seq()
+      while( n != nsbEnd) {
+        val prefix = if(n.prefix == null) "" else n.prefix
+        prefixes :+= prefix
+        n = n.parent
+      }
+      prefixes.reverse.foreach(contentHandler.endPrefixMapping)
+      true
+    } catch {
+      case _: SAXException => false
+    }
+  }
+
+  /**
+   * Called by Daffodil internals to signify the beginning of a complex element.
+   *
+   * @param diComplex the complex element that is started. Various fields of
+   *                  DIComplex can be accessed to determine things like the
+   *                  nil, name, namespace, etc.
+   * @return true on sucess, false if there was an error and Daffodil should stop all
+   *         future calls to the InfosetOutputter
+   */
+  override def startComplex(diComplex: DIComplex): Boolean = {
+    val contentHandler = xmlReader.getContentHandler
+    val ns = if (diComplex.erd.thisElementsNamespace.isNoNamespace) "" else diComplex.erd.thisElementsNamespace.toString
+    val prefix = if(diComplex.erd.thisElementsNamespacePrefix == null) "" else  diComplex.erd.thisElementsNamespacePrefix
+    val elemName = diComplex.erd.name
+    val qName = if (prefix == "") elemName else s"$prefix:$elemName"
+
+    try {
+      val nsbStart = diComplex.erd.minimizedScope
+      val nsbEnd = if (diComplex.isRoot) scala.xml.TopScope else diComplex.diParent.erd.minimizedScope
+      var n = nsbStart
+      while( n != nsbEnd) {
+        val prefix = if(n.prefix == null) "" else n.prefix
+        val uri = if(n.uri == null) "" else n.uri
+        contentHandler.startPrefixMapping(prefix, uri)
+        n = n.parent
+      }
+
+      val attrs = if(isNilled(diComplex)) {
+        createNilAttribute()
+      } else {
+        new AttributesImpl()
+      }
+
+      contentHandler.startElement(ns, elemName, qName, attrs)
+      true
+    } catch {
+      case _: SAXException => false
+    }
+  }
+
+  /**
+   * Called by Daffodil internals to signify the end of a complex element.
+   *
+   * @param diComplex the complex element that is ended. Various fields of
+   *                  DIComplex can be accessed to determine things like the
+   *                  nil, name, namespace, etc.
+   * @return true on sucess, false if there was an error and Daffodil should stop all
+   *         future calls to the InfosetOutputter
+   */
+  override def endComplex(diComplex: DIComplex): Boolean = {
+    val contentHandler = xmlReader.getContentHandler
+    val ns = diComplex.erd.thisElementsNamespace
+    val nsb = diComplex.namedQName
+    try {
+      contentHandler.endElement(ns, nsb.local, nsb.toQNameString)
+
+      val nsbStart = diComplex.erd.minimizedScope
+      val nsbEnd = if (diComplex.isRoot) scala.xml.TopScope else diComplex.diParent.erd.minimizedScope
+      var n = nsbStart
+
+      // we store the the prefixes in the same order as we started them so we can
+      // reverse them and end them in the enclosing order
+      var prefixes: Seq[String] = Seq()
+      while( n != nsbEnd) {
+        val prefix = if(n.prefix == null) "" else n.prefix
+        prefixes :+= prefix
+        n = n.parent
+      }
+      prefixes.reverse.foreach(contentHandler.endPrefixMapping)
+      true

Review comment:
       This also looks the same as in endSimple. Same recommendation here: move this into an endElement function that endSimple and endComplex both call.

##########
File path: daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/DataProcessor.scala
##########
@@ -718,3 +748,196 @@ class UnparseResult(dp: DataProcessor, ustate: UState)
     encodingInfo.knownEncodingName
   }
 }
+
+class DaffodilXMLReader(dp: DataProcessor) extends XMLReader with DFDL.DaffodilXMLReader {
+  private var _contentHandler: ContentHandler = _
+  private var _errorHandler: ErrorHandler = _
+  private var _dtdHandler: DTDHandler = _
+  private var _entityResolver: EntityResolver = _
+  val SAXNamespaceFeature = "http://xml.org/sax/features/namespaces"
+  val SAXNamespacePrefixFeature = "http://xml.org/sax/features/namespace-prefixes"
+  val SAXParseResultProperty = "org.apache.daffodil.processors.ParseResult"
+
+  private var _propertyMap = mutable.Map[String, AnyRef]()
+  private val _featureMap = mutable.Map[String, Boolean](SAXNamespaceFeature -> false,
+    SAXNamespacePrefixFeature -> false)
+
+  override def getFeature(name: String): Boolean = {
+    name match {
+      case SAXNamespaceFeature | SAXNamespacePrefixFeature => _featureMap(name)
+      case _ => {
+        throw new SAXNotRecognizedException("Only namespace and namespace features are supported for SAX parsing")
+        false
+      }
+    }
+  }
+
+  override def setFeature(name: String, value: Boolean): Unit = {
+    name match {
+      case SAXNamespaceFeature | SAXNamespacePrefixFeature =>
+        _featureMap(name) = value
+      case _ => {
+        throw new SAXNotRecognizedException("Only namespace and namespace features are supported for SAX parsing")
+      }
+    }
+  }
+
+  override def getProperty(name: String): AnyRef = {
+    _propertyMap.getOrElse(name,
+      throw new SAXNotSupportedException(s"SAX Property '${name}' cannot be retrieved"))
+  }
+
+  override def setProperty(name: String, value: AnyRef): Unit = {
+    _propertyMap += (name -> value)
+  }
+
+  override def setEntityResolver(resolver: EntityResolver): Unit = {
+    _entityResolver = resolver
+  }
+
+  override def getEntityResolver: EntityResolver = _entityResolver
+
+  override def setDTDHandler(handler: DTDHandler): Unit = {
+    _dtdHandler = handler
+  }
+
+  override def getDTDHandler: DTDHandler = _dtdHandler
+
+  override def setContentHandler(handler: ContentHandler): Unit = {
+    _contentHandler = handler;
+  }
+
+  override def getContentHandler: ContentHandler = _contentHandler
+
+  override def setErrorHandler(handler: ErrorHandler): Unit = {
+    _errorHandler = handler;
+  }
+
+  override def getErrorHandler: ErrorHandler = _errorHandler
+
+  override def parse(input: InputSource): Unit = {
+    val is = input.getByteStream
+    if(is != null) {
+      val isdis = InputSourceDataInputStream(is)
+      val sio = createSAXInfosetOutputter(this)
+      val pr = dp.parse(isdis, sio)
+      handleDiagnostics(pr)
+      setProperty(SAXParseResultProperty, pr)
+    } else {
+      throw new IOException("Inputsource must be backed by Inputstream")
+    }
+  }
+
+  override def parse(systemId: String): Unit = {
+    throw new IOException("SAX parsing of systemId is unsupported")
+  }
+
+  def parse(isdis: InputSourceDataInputStream): Unit = {
+    val sio = createSAXInfosetOutputter(this)
+    val pr = dp.parse(isdis, sio)
+    handleDiagnostics(pr)
+    setProperty(SAXParseResultProperty, pr)
+  }
+
+  def parse(stream: InputStream): Unit = {
+    val isdis = InputSourceDataInputStream(stream)
+    val sio = createSAXInfosetOutputter(this)
+    val pr = dp.parse(isdis, sio)
+    handleDiagnostics(pr)
+    setProperty(SAXParseResultProperty, pr)
+  }
+
+  def parse(arr: Array[Byte]): Unit = {
+    val isdis = InputSourceDataInputStream(arr)
+    val sio = createSAXInfosetOutputter(this)
+    val pr = dp.parse(isdis, sio)
+    handleDiagnostics(pr)
+    setProperty(SAXParseResultProperty, pr)
+  }
+
+  private def handleDiagnostics(pr: DFDL.ParseResult): Unit = {
+    val diagnostics = pr.getDiagnostics
+    if (diagnostics.nonEmpty) {
+      val eh = this.getErrorHandler
+      diagnostics.foreach { d =>
+        val spe = {
+          val msg = d.getMessage()
+          val (lineNo, colNo, systemId) = d.getLocationsInSchemaFiles.headOption.map { s =>
+            val sl = s.asInstanceOf[SchemaFileLocation]
+            val ln = sl.lineNumber.getOrElse("0").toInt
+            val cn = sl.columnNumber.getOrElse("0").toInt
+            val sId = sl.uriString
+            (ln, cn, sId)
+          }.getOrElse((0,0, null))
+
+          val spe = new SAXParseException(msg, null, systemId, lineNo, colNo, d)
+          spe
+        }
+
+        if (d.isError) {
+          eh.error(spe)
+        } else {
+          eh.warning(spe)
+        }
+      }
+    }
+  }
+
+  /**
+   * Creates SAXInfosetOutputter object and attempts to setBlobAttributes on it if
+   * it has at least the blobDirectory property set
+   *
+   * @return SAXInfosetOutputter object with or without blob Attributes set
+   */
+  private def createSAXInfosetOutputter(xmlReader: DaffodilXMLReader): SAXInfosetOutputter = {
+    val sioo = new SAXInfosetOutputter(xmlReader)
+    val siof = try {
+      val blobDir = try {
+        getProperty("BlobDirectory").asInstanceOf[java.nio.file.Path]
+      } catch {
+        case _: SAXNotSupportedException => null
+      }
+      val blobPrefix = try {
+        getProperty("BlobPrefix").asInstanceOf[String]
+      } catch {
+        case _: SAXNotSupportedException => null
+      }
+      val blobSuffix = try {
+        getProperty("BlobSuffix").asInstanceOf[String]
+      } catch {
+        case _: SAXNotSupportedException => null
+      }
+      if (blobDir != null) sioo.setBlobAttributes(blobDir, blobPrefix, blobSuffix)

Review comment:
       We should probably set defaults for the properties, e.g. blobDir should default to ``System.getProperty("java.io.tmpdir")``. That way SAX users can override individual properties and things still work. Probably want to have the same defaults as defined in InfosetOutputter.scala. And then we don't have to worry about whether users set a property or not.

##########
File path: daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/DataProcessor.scala
##########
@@ -334,6 +359,11 @@ class DataProcessor private (
 
   override def getDiagnostics = ssrd.diagnostics
 
+  override def newXMLReaderInstance: DFDL.DaffodilXMLReader = new DaffodilXMLReader(this)
+
+  def newContentHandlerInstance(output: OutputStream): DaffodilContentHandler =
+    new DaffodilContentHandler(this, output)
+

Review comment:
       I'd prefer things specific to unparse just not be added in this PR. Makes it easier to see what is specific to unparsing and what isn't. When we get to the unparse PR, I'll want to see this line as an addition. Otherwise I'll probably have forgotten about it.

##########
File path: daffodil-runtime1/src/main/scala/org/apache/daffodil/infoset/SAXInfosetOutputter.scala
##########
@@ -0,0 +1,266 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.daffodil.infoset
+
+import org.apache.daffodil.api.DFDL
+import org.apache.daffodil.dpath.NodeInfo
+import org.apache.daffodil.xml.XMLUtils
+import org.xml.sax.SAXException
+import org.xml.sax.helpers.AttributesImpl
+
+class SAXInfosetOutputter(xmlReader: DFDL.DaffodilXMLReader)
+  extends InfosetOutputter
+  with XMLInfosetOutputter {
+  /**
+   * Reset the internal state of this InfosetOutputter. This should be called
+   * inbetween calls to the parse method.
+   */
+  override def reset(): Unit = {
+    // this doesn't do anything as the ContentHandler API does not support
+    // resetting, but some implemented ContentHandlers, such as the JDOM SaxHandler,
+    // do support resetting so it's up to the creator of the contentHandler, to call
+    // their contentHandler's reset if applicable and if necessary
+  }
+
+  /**
+   * Called by Daffodil internals to signify the beginning of the infoset.
+   *
+   * @return true on sucess, false if there was an error and Daffodil should stop all
+   *         future calls to the InfosetOutputter
+   */
+  override def startDocument(): Boolean = {
+    val contentHandler = xmlReader.getContentHandler
+    try {
+      contentHandler.startDocument()
+      true
+    } catch {
+      case _: SAXException => false
+    }
+  }
+
+  /**
+   * Called by Daffodil internals to signify the end of the infoset.
+   *
+   * @return true on sucess, false if there was an error and Daffodil should stop all
+   *         future calls to the InfosetOutputter
+   */
+  override def endDocument(): Boolean = {
+    val contentHandler = xmlReader.getContentHandler
+    try {
+      contentHandler.endDocument()
+      true
+    } catch {
+      case _: SAXException => false
+    }
+  }
+
+  /**
+   * Called by Daffodil internals to signify the beginning of a simple element.
+   *
+   * @param diSimple the simple element that is started. Various fields of
+   *                 DISimple can be accessed to determine things like the
+   *                 value, nil, name, namespace, etc.
+   * @return true on sucess, false if there was an error and Daffodil should stop all
+   *         future calls to the InfosetOutputter
+   */
+  override def startSimple(diSimple: DISimple): Boolean = {
+    val contentHandler = xmlReader.getContentHandler
+    val ns = if (diSimple.erd.thisElementsNamespace.isNoNamespace) "" else diSimple.erd.thisElementsNamespace.toString
+    val prefix = if(diSimple.erd.thisElementsNamespacePrefix == null) "" else  diSimple.erd.thisElementsNamespacePrefix
+    val elemName = diSimple.erd.name
+    val qName = if (prefix == "") elemName else s"$prefix:$elemName"
+
+    try {
+      val nsbStart = diSimple.erd.minimizedScope
+      val nsbEnd = if (diSimple.isRoot) scala.xml.TopScope else diSimple.diParent.erd.minimizedScope
+      var n = nsbStart
+      while( n != nsbEnd) {
+        val prefix = if(n.prefix == null) "" else n.prefix
+        val uri = if(n.uri == null) "" else n.uri
+        contentHandler.startPrefixMapping(prefix, uri)
+        n = n.parent
+      }
+
+      val attrs = if(isNilled(diSimple)) {
+        createNilAttribute()
+      } else {
+        new AttributesImpl()
+      }
+
+      contentHandler.startElement(ns, elemName, qName, attrs)
+
+      if (diSimple.hasValue)  {
+        val text =
+          if (diSimple.erd.optPrimType.get.isInstanceOf[NodeInfo.String.Kind]) {
+            remapped(diSimple.dataValueAsString)
+          } else {
+            diSimple.dataValueAsString
+          }
+        val arr = text.toCharArray
+        contentHandler.characters(arr,0, arr.length)
+      }
+      true
+    } catch {
+      case _: SAXException  => false
+    }
+  }
+
+  /**
+   * Called by Daffodil internals to signify the end of a simple element.
+   *
+   * @param diSimple the simple element that is ended. Various fields of
+   *                 DISimple can be accessed to determine things like the
+   *                 value, nil, name, namespace, etc.
+   * @return true on sucess, false if there was an error and Daffodil should stop all
+   *         future calls to the InfosetOutputter
+   */
+  override def endSimple(diSimple: DISimple): Boolean = {
+    val contentHandler = xmlReader.getContentHandler
+    val ns = diSimple.erd.thisElementsNamespace
+    val nsb = diSimple.namedQName
+    try {
+      contentHandler.endElement(ns, nsb.local, nsb.toQNameString)
+
+      val nsbStart = diSimple.erd.minimizedScope
+      val nsbEnd = if (diSimple.isRoot) scala.xml.TopScope else diSimple.diParent.erd.minimizedScope
+      var n = nsbStart
+
+      // we store the the prefixes in the same order as we started them so we can
+      // reverse them and end them in the enclosing order
+      var prefixes: Seq[String] = Seq()
+      while( n != nsbEnd) {
+        val prefix = if(n.prefix == null) "" else n.prefix
+        prefixes :+= prefix
+        n = n.parent
+      }
+      prefixes.reverse.foreach(contentHandler.endPrefixMapping)

Review comment:
       I don't think this reversal is necessary. From the ContentHandler API:
   > Note that start/endPrefixMapping events are not guaranteed to be properly nested relative to each other: all startPrefixMapping events will occur immediately before the corresponding startElement event, and all endPrefixMapping events will occur immediately after the corresponding endElement event, but their order is not otherwise guaranteed.
   
   So we don't have to worry about the order here. If it matters to the ContentHandler then they need to handle it regardless, so let's avoid doing extra work.

##########
File path: daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/DataProcessor.scala
##########
@@ -718,3 +748,196 @@ class UnparseResult(dp: DataProcessor, ustate: UState)
     encodingInfo.knownEncodingName
   }
 }
+
+class DaffodilXMLReader(dp: DataProcessor) extends XMLReader with DFDL.DaffodilXMLReader {
+  private var _contentHandler: ContentHandler = _
+  private var _errorHandler: ErrorHandler = _
+  private var _dtdHandler: DTDHandler = _
+  private var _entityResolver: EntityResolver = _
+  val SAXNamespaceFeature = "http://xml.org/sax/features/namespaces"
+  val SAXNamespacePrefixFeature = "http://xml.org/sax/features/namespace-prefixes"
+  val SAXParseResultProperty = "org.apache.daffodil.processors.ParseResult"
+
+  private var _propertyMap = mutable.Map[String, AnyRef]()
+  private val _featureMap = mutable.Map[String, Boolean](SAXNamespaceFeature -> false,
+    SAXNamespacePrefixFeature -> false)
+
+  override def getFeature(name: String): Boolean = {
+    name match {
+      case SAXNamespaceFeature | SAXNamespacePrefixFeature => _featureMap(name)
+      case _ => {
+        throw new SAXNotRecognizedException("Only namespace and namespace features are supported for SAX parsing")
+        false
+      }
+    }
+  }
+
+  override def setFeature(name: String, value: Boolean): Unit = {
+    name match {
+      case SAXNamespaceFeature | SAXNamespacePrefixFeature =>
+        _featureMap(name) = value
+      case _ => {
+        throw new SAXNotRecognizedException("Only namespace and namespace features are supported for SAX parsing")
+      }
+    }
+  }
+
+  override def getProperty(name: String): AnyRef = {
+    _propertyMap.getOrElse(name,
+      throw new SAXNotSupportedException(s"SAX Property '${name}' cannot be retrieved"))
+  }
+
+  override def setProperty(name: String, value: AnyRef): Unit = {
+    _propertyMap += (name -> value)
+  }
+
+  override def setEntityResolver(resolver: EntityResolver): Unit = {
+    _entityResolver = resolver
+  }
+
+  override def getEntityResolver: EntityResolver = _entityResolver
+
+  override def setDTDHandler(handler: DTDHandler): Unit = {
+    _dtdHandler = handler
+  }
+
+  override def getDTDHandler: DTDHandler = _dtdHandler
+
+  override def setContentHandler(handler: ContentHandler): Unit = {
+    _contentHandler = handler;
+  }
+
+  override def getContentHandler: ContentHandler = _contentHandler
+
+  override def setErrorHandler(handler: ErrorHandler): Unit = {
+    _errorHandler = handler;
+  }
+
+  override def getErrorHandler: ErrorHandler = _errorHandler
+
+  override def parse(input: InputSource): Unit = {
+    val is = input.getByteStream
+    if(is != null) {
+      val isdis = InputSourceDataInputStream(is)
+      val sio = createSAXInfosetOutputter(this)
+      val pr = dp.parse(isdis, sio)
+      handleDiagnostics(pr)
+      setProperty(SAXParseResultProperty, pr)
+    } else {
+      throw new IOException("Inputsource must be backed by Inputstream")
+    }
+  }
+
+  override def parse(systemId: String): Unit = {
+    throw new IOException("SAX parsing of systemId is unsupported")
+  }
+
+  def parse(isdis: InputSourceDataInputStream): Unit = {
+    val sio = createSAXInfosetOutputter(this)
+    val pr = dp.parse(isdis, sio)
+    handleDiagnostics(pr)
+    setProperty(SAXParseResultProperty, pr)
+  }
+
+  def parse(stream: InputStream): Unit = {
+    val isdis = InputSourceDataInputStream(stream)
+    val sio = createSAXInfosetOutputter(this)
+    val pr = dp.parse(isdis, sio)
+    handleDiagnostics(pr)
+    setProperty(SAXParseResultProperty, pr)
+  }
+
+  def parse(arr: Array[Byte]): Unit = {
+    val isdis = InputSourceDataInputStream(arr)
+    val sio = createSAXInfosetOutputter(this)
+    val pr = dp.parse(isdis, sio)
+    handleDiagnostics(pr)
+    setProperty(SAXParseResultProperty, pr)
+  }

Review comment:
       There's a lot of duplicate code to clean up. All the parse() functions can create an InputSourceDataInputStream and then just call the ``parse(isdis: InputSourceDataInputStream)`` function. Then that one function can be the only one that creates the InfosetOutputter, calls dp.parse, handles diagnostics, etc. Having one entry point that calls dp.parse and handles errors makes it much easier to make changes in the future.

##########
File path: daffodil-runtime1/src/main/scala/org/apache/daffodil/infoset/SAXInfosetOutputter.scala
##########
@@ -0,0 +1,266 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.daffodil.infoset
+
+import org.apache.daffodil.api.DFDL
+import org.apache.daffodil.dpath.NodeInfo
+import org.apache.daffodil.xml.XMLUtils
+import org.xml.sax.SAXException
+import org.xml.sax.helpers.AttributesImpl
+
+class SAXInfosetOutputter(xmlReader: DFDL.DaffodilXMLReader)
+  extends InfosetOutputter
+  with XMLInfosetOutputter {
+  /**
+   * Reset the internal state of this InfosetOutputter. This should be called
+   * inbetween calls to the parse method.
+   */
+  override def reset(): Unit = {
+    // this doesn't do anything as the ContentHandler API does not support
+    // resetting, but some implemented ContentHandlers, such as the JDOM SaxHandler,
+    // do support resetting so it's up to the creator of the contentHandler, to call
+    // their contentHandler's reset if applicable and if necessary
+  }
+
+  /**
+   * Called by Daffodil internals to signify the beginning of the infoset.
+   *
+   * @return true on sucess, false if there was an error and Daffodil should stop all
+   *         future calls to the InfosetOutputter
+   */
+  override def startDocument(): Boolean = {
+    val contentHandler = xmlReader.getContentHandler
+    try {
+      contentHandler.startDocument()
+      true
+    } catch {
+      case _: SAXException => false
+    }
+  }
+
+  /**
+   * Called by Daffodil internals to signify the end of the infoset.
+   *
+   * @return true on sucess, false if there was an error and Daffodil should stop all
+   *         future calls to the InfosetOutputter
+   */
+  override def endDocument(): Boolean = {
+    val contentHandler = xmlReader.getContentHandler
+    try {
+      contentHandler.endDocument()
+      true
+    } catch {
+      case _: SAXException => false
+    }
+  }
+
+  /**
+   * Called by Daffodil internals to signify the beginning of a simple element.
+   *
+   * @param diSimple the simple element that is started. Various fields of
+   *                 DISimple can be accessed to determine things like the
+   *                 value, nil, name, namespace, etc.
+   * @return true on sucess, false if there was an error and Daffodil should stop all
+   *         future calls to the InfosetOutputter
+   */
+  override def startSimple(diSimple: DISimple): Boolean = {
+    val contentHandler = xmlReader.getContentHandler
+    val ns = if (diSimple.erd.thisElementsNamespace.isNoNamespace) "" else diSimple.erd.thisElementsNamespace.toString
+    val prefix = if(diSimple.erd.thisElementsNamespacePrefix == null) "" else  diSimple.erd.thisElementsNamespacePrefix
+    val elemName = diSimple.erd.name
+    val qName = if (prefix == "") elemName else s"$prefix:$elemName"
+
+    try {
+      val nsbStart = diSimple.erd.minimizedScope
+      val nsbEnd = if (diSimple.isRoot) scala.xml.TopScope else diSimple.diParent.erd.minimizedScope
+      var n = nsbStart
+      while( n != nsbEnd) {
+        val prefix = if(n.prefix == null) "" else n.prefix
+        val uri = if(n.uri == null) "" else n.uri
+        contentHandler.startPrefixMapping(prefix, uri)
+        n = n.parent
+      }
+
+      val attrs = if(isNilled(diSimple)) {
+        createNilAttribute()
+      } else {
+        new AttributesImpl()
+      }
+
+      contentHandler.startElement(ns, elemName, qName, attrs)

Review comment:
       I think everything from here and up is exactly the same as for startComplex. Recommend pulling that all out into a helper function (e.g. startElement) to reduce code duplication. Note that DISimple and DIComplex both extend DIElement, so it can just be something like ``startElement(e: DIElement) ...`` and the code is basically the same.

##########
File path: daffodil-tdml-processor/src/main/scala/org/apache/daffodil/tdml/processor/DaffodilTDMLDFDLProcessor.scala
##########
@@ -274,6 +285,74 @@ class DaffodilTDMLDFDLProcessor private (private var dp: DataProcessor) extends
     new DaffodilTDMLUnparseResult(actual, outStream)
   }
 
+  def doParseWithBothApis(dpInputStream: java.io.InputStream, saxInputStream: java.io.InputStream,
+    lengthLimitInBits: Long): TDMLParseResult = {
+    val outputter = new TDMLInfosetOutputter()
+    outputter.setBlobAttributes(blobDir, blobPrefix, blobSuffix)
+
+    val xri = dp.newXMLReaderInstance
+    xri.setContentHandler(new SAXHandler())
+    xri.setErrorHandler(new DaffodilSAXErrorHandler())
+    xri.setProperty("BlobDirectory", blobDir)
+    xri.setProperty("BlobPrefix", blobPrefix)
+    xri.setProperty("BlobSuffix", blobSuffix)
+
+    val dis = InputSourceDataInputStream(dpInputStream)
+    val sis = InputSourceDataInputStream(saxInputStream)
+    if (lengthLimitInBits >= 0 && lengthLimitInBits % 8 != 0) {
+      // Only set the bit limit if the length is not a multiple of 8. In that
+      // case, we aren't expected to consume all the data and need a bitLimit
+      // to prevent messages about left over bits.
+      dis.setBitLimit0b(MaybeULong(lengthLimitInBits))
+      sis.setBitLimit0b(MaybeULong(lengthLimitInBits))
+    }
+    val actual = dp.parse(dis, outputter)
+    xri.parse(sis)
+    val errorHandler = xri.getErrorHandler.asInstanceOf[DaffodilSAXErrorHandler]

Review comment:
       No need to get it again, we created it up above, just do
   ```scala
   val errorHandler = new DaffodilSAXErrorHandler()
   xri.setErrorHandler(errorHandler)
   ```

##########
File path: daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/DataProcessor.scala
##########
@@ -18,53 +18,78 @@
 package org.apache.daffodil.processors
 
 import java.io.File
+import java.io.IOException
+import java.io.InputStream
 import java.io.ObjectOutputStream
-import java.nio.channels.Channels
+import java.io.OutputStream
 import java.nio.CharBuffer
-import java.nio.file.Files
 import java.nio.LongBuffer
+import java.nio.channels.Channels
+import java.nio.file.Files
 import java.util.zip.GZIPOutputStream
 
-import org.xml.sax.ErrorHandler
-import org.xml.sax.SAXException
-import org.xml.sax.SAXParseException
-import org.apache.daffodil.Implicits._
+import scala.collection.immutable.Queue
+import scala.collection.mutable
 
-import scala.collection.immutable.Queue; object INoWarn4 { ImplicitsSuppressUnusedImportWarning() }
-import org.apache.daffodil.equality._; object EqualityNoWarn3 { EqualitySuppressUnusedImportWarning() }
-import org.apache.daffodil.api.WithDiagnostics
-import org.apache.daffodil.exceptions.Assert
-import org.apache.daffodil.dsom._
 import org.apache.daffodil.ExecutionMode
+import org.apache.daffodil.Implicits._
 import org.apache.daffodil.api.DFDL
-import org.apache.daffodil.api.WithDiagnostics
-import org.apache.daffodil.util.Validator
+import org.apache.daffodil.api.DaffodilTunables
+import org.apache.daffodil.api.Diagnostic
 import org.apache.daffodil.api.ValidationMode
-import org.apache.daffodil.externalvars.ExternalVariablesLoader
-import org.apache.daffodil.externalvars.Binding
-import org.apache.daffodil.util.Maybe
-import org.apache.daffodil.util.Maybe._
-import org.apache.daffodil.util.Logging
+import org.apache.daffodil.api.WithDiagnostics
 import org.apache.daffodil.debugger.Debugger
-import org.apache.daffodil.processors.unparsers.UState
-import org.apache.daffodil.infoset.InfosetInputter
-import org.apache.daffodil.processors.unparsers.UnparseError
-import org.apache.daffodil.oolag.ErrorAlreadyHandled
+import org.apache.daffodil.dsom.TunableLimitExceededError
+import org.apache.daffodil.dsom._
+import org.apache.daffodil.equality._
 import org.apache.daffodil.events.MultipleEventHandler
-import org.apache.daffodil.io.DirectOrBufferedDataOutputStream
-import org.apache.daffodil.io.InputSourceDataInputStream
-import org.apache.daffodil.util.LogLevel
+import org.apache.daffodil.exceptions.Assert
+import org.apache.daffodil.exceptions.SchemaFileLocation
+import org.apache.daffodil.exceptions.UnsuppressableException
+import org.apache.daffodil.externalvars.Binding
+import org.apache.daffodil.externalvars.ExternalVariablesLoader
+import org.apache.daffodil.infoset.DIElement
+import org.apache.daffodil.infoset.InfosetElement
+import org.apache.daffodil.infoset.InfosetException
+import org.apache.daffodil.infoset.InfosetInputter
+import org.apache.daffodil.infoset.InfosetOutputter
+import org.apache.daffodil.infoset.SAXInfosetOutputter
+import org.apache.daffodil.infoset.XMLTextInfosetOutputter
 import org.apache.daffodil.io.BitOrderChangeException
+import org.apache.daffodil.io.DirectOrBufferedDataOutputStream
 import org.apache.daffodil.io.FileIOException
-import org.apache.daffodil.infoset._
+import org.apache.daffodil.io.InputSourceDataInputStream
+import org.apache.daffodil.oolag.ErrorAlreadyHandled
+import org.apache.daffodil.processors.parsers.PState
 import org.apache.daffodil.processors.parsers.ParseError
 import org.apache.daffodil.processors.parsers.Parser
-import org.apache.daffodil.processors.parsers.PState
-import org.apache.daffodil.exceptions.UnsuppressableException
-import org.apache.daffodil.dsom.TunableLimitExceededError
-import org.apache.daffodil.api.DaffodilTunables
-import java.io.IOException
+import org.apache.daffodil.processors.unparsers.UState
+import org.apache.daffodil.processors.unparsers.UnparseError
+import org.apache.daffodil.util.LogLevel
+import org.apache.daffodil.util.Logging
+import org.apache.daffodil.util.Maybe
+import org.apache.daffodil.util.Maybe._
 import org.apache.daffodil.util.Misc
+import org.apache.daffodil.util.Validator
+import org.xml.sax.ContentHandler
+import org.xml.sax.DTDHandler
+import org.xml.sax.EntityResolver
+import org.xml.sax.ErrorHandler
+import org.xml.sax.InputSource
+import org.xml.sax.SAXException
+import org.xml.sax.SAXNotRecognizedException
+import org.xml.sax.SAXNotSupportedException
+import org.xml.sax.SAXParseException
+import org.xml.sax.XMLReader;
+
+
+object INoWarn4 {
+  ImplicitsSuppressUnusedImportWarning()
+}
+
+object EqualityNoWarn3 {
+  EqualitySuppressUnusedImportWarning()
+}

Review comment:
       What are these? Did your IDE do something weird with the imports?

##########
File path: daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/DataProcessor.scala
##########
@@ -718,3 +748,196 @@ class UnparseResult(dp: DataProcessor, ustate: UState)
     encodingInfo.knownEncodingName
   }
 }
+
+class DaffodilXMLReader(dp: DataProcessor) extends XMLReader with DFDL.DaffodilXMLReader {
+  private var _contentHandler: ContentHandler = _
+  private var _errorHandler: ErrorHandler = _
+  private var _dtdHandler: DTDHandler = _
+  private var _entityResolver: EntityResolver = _
+  val SAXNamespaceFeature = "http://xml.org/sax/features/namespaces"
+  val SAXNamespacePrefixFeature = "http://xml.org/sax/features/namespace-prefixes"
+  val SAXParseResultProperty = "org.apache.daffodil.processors.ParseResult"
+
+  private var _propertyMap = mutable.Map[String, AnyRef]()
+  private val _featureMap = mutable.Map[String, Boolean](SAXNamespaceFeature -> false,
+    SAXNamespacePrefixFeature -> false)
+
+  override def getFeature(name: String): Boolean = {
+    name match {
+      case SAXNamespaceFeature | SAXNamespacePrefixFeature => _featureMap(name)
+      case _ => {
+        throw new SAXNotRecognizedException("Only namespace and namespace features are supported for SAX parsing")

Review comment:
       Suggest just erroring with ``"Feature unsupported: " + name``. That way if we add new features we don't have to also update this error message. Same with below error messages.

##########
File path: daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/DataProcessor.scala
##########
@@ -718,3 +748,196 @@ class UnparseResult(dp: DataProcessor, ustate: UState)
     encodingInfo.knownEncodingName
   }
 }
+
+class DaffodilXMLReader(dp: DataProcessor) extends XMLReader with DFDL.DaffodilXMLReader {
+  private var _contentHandler: ContentHandler = _
+  private var _errorHandler: ErrorHandler = _
+  private var _dtdHandler: DTDHandler = _
+  private var _entityResolver: EntityResolver = _
+  val SAXNamespaceFeature = "http://xml.org/sax/features/namespaces"
+  val SAXNamespacePrefixFeature = "http://xml.org/sax/features/namespace-prefixes"
+  val SAXParseResultProperty = "org.apache.daffodil.processors.ParseResult"
+
+  private var _propertyMap = mutable.Map[String, AnyRef]()
+  private val _featureMap = mutable.Map[String, Boolean](SAXNamespaceFeature -> false,
+    SAXNamespacePrefixFeature -> false)
+
+  override def getFeature(name: String): Boolean = {
+    name match {
+      case SAXNamespaceFeature | SAXNamespacePrefixFeature => _featureMap(name)
+      case _ => {
+        throw new SAXNotRecognizedException("Only namespace and namespace features are supported for SAX parsing")
+        false
+      }
+    }
+  }
+
+  override def setFeature(name: String, value: Boolean): Unit = {
+    name match {
+      case SAXNamespaceFeature | SAXNamespacePrefixFeature =>
+        _featureMap(name) = value
+      case _ => {
+        throw new SAXNotRecognizedException("Only namespace and namespace features are supported for SAX parsing")
+      }
+    }
+  }
+
+  override def getProperty(name: String): AnyRef = {
+    _propertyMap.getOrElse(name,
+      throw new SAXNotSupportedException(s"SAX Property '${name}' cannot be retrieved"))
+  }
+
+  override def setProperty(name: String, value: AnyRef): Unit = {
+    _propertyMap += (name -> value)
+  }
+
+  override def setEntityResolver(resolver: EntityResolver): Unit = {
+    _entityResolver = resolver
+  }
+
+  override def getEntityResolver: EntityResolver = _entityResolver
+
+  override def setDTDHandler(handler: DTDHandler): Unit = {
+    _dtdHandler = handler
+  }
+
+  override def getDTDHandler: DTDHandler = _dtdHandler
+
+  override def setContentHandler(handler: ContentHandler): Unit = {
+    _contentHandler = handler;
+  }
+
+  override def getContentHandler: ContentHandler = _contentHandler
+
+  override def setErrorHandler(handler: ErrorHandler): Unit = {
+    _errorHandler = handler;
+  }
+
+  override def getErrorHandler: ErrorHandler = _errorHandler
+
+  override def parse(input: InputSource): Unit = {
+    val is = input.getByteStream
+    if(is != null) {
+      val isdis = InputSourceDataInputStream(is)
+      val sio = createSAXInfosetOutputter(this)
+      val pr = dp.parse(isdis, sio)
+      handleDiagnostics(pr)
+      setProperty(SAXParseResultProperty, pr)
+    } else {
+      throw new IOException("Inputsource must be backed by Inputstream")
+    }
+  }
+
+  override def parse(systemId: String): Unit = {
+    throw new IOException("SAX parsing of systemId is unsupported")
+  }
+
+  def parse(isdis: InputSourceDataInputStream): Unit = {
+    val sio = createSAXInfosetOutputter(this)
+    val pr = dp.parse(isdis, sio)
+    handleDiagnostics(pr)
+    setProperty(SAXParseResultProperty, pr)
+  }
+
+  def parse(stream: InputStream): Unit = {
+    val isdis = InputSourceDataInputStream(stream)
+    val sio = createSAXInfosetOutputter(this)
+    val pr = dp.parse(isdis, sio)
+    handleDiagnostics(pr)
+    setProperty(SAXParseResultProperty, pr)
+  }
+
+  def parse(arr: Array[Byte]): Unit = {
+    val isdis = InputSourceDataInputStream(arr)
+    val sio = createSAXInfosetOutputter(this)
+    val pr = dp.parse(isdis, sio)
+    handleDiagnostics(pr)
+    setProperty(SAXParseResultProperty, pr)
+  }
+
+  private def handleDiagnostics(pr: DFDL.ParseResult): Unit = {
+    val diagnostics = pr.getDiagnostics
+    if (diagnostics.nonEmpty) {
+      val eh = this.getErrorHandler
+      diagnostics.foreach { d =>
+        val spe = {
+          val msg = d.getMessage()
+          val (lineNo, colNo, systemId) = d.getLocationsInSchemaFiles.headOption.map { s =>
+            val sl = s.asInstanceOf[SchemaFileLocation]
+            val ln = sl.lineNumber.getOrElse("0").toInt
+            val cn = sl.columnNumber.getOrElse("0").toInt
+            val sId = sl.uriString
+            (ln, cn, sId)
+          }.getOrElse((0,0, null))
+
+          val spe = new SAXParseException(msg, null, systemId, lineNo, colNo, d)
+          spe
+        }
+
+        if (d.isError) {
+          eh.error(spe)
+        } else {
+          eh.warning(spe)
+        }
+      }
+    }
+  }
+
+  /**
+   * Creates SAXInfosetOutputter object and attempts to setBlobAttributes on it if
+   * it has at least the blobDirectory property set
+   *
+   * @return SAXInfosetOutputter object with or without blob Attributes set
+   */
+  private def createSAXInfosetOutputter(xmlReader: DaffodilXMLReader): SAXInfosetOutputter = {
+    val sioo = new SAXInfosetOutputter(xmlReader)
+    val siof = try {
+      val blobDir = try {
+        getProperty("BlobDirectory").asInstanceOf[java.nio.file.Path]
+      } catch {
+        case _: SAXNotSupportedException => null
+      }
+      val blobPrefix = try {
+        getProperty("BlobPrefix").asInstanceOf[String]
+      } catch {
+        case _: SAXNotSupportedException => null
+      }
+      val blobSuffix = try {
+        getProperty("BlobSuffix").asInstanceOf[String]
+      } catch {
+        case _: SAXNotSupportedException => null
+      }
+      if (blobDir != null) sioo.setBlobAttributes(blobDir, blobPrefix, blobSuffix)
+      sioo
+    } catch {
+      case e: SAXNotSupportedException => sioo
+    }
+    siof
+  }
+}
+
+class DaffodilSAXErrorHandler extends ErrorHandler with WithDiagnostics {
+
+  private var _diagnostics: Seq[Diagnostic] = Nil
+  private var _isError: Boolean = true
+
+  override def warning(exception: SAXParseException): Unit = {
+    _isError = false
+    val embeddedDiagnostic = exception.getCause.asInstanceOf[Diagnostic]
+    _diagnostics :+= embeddedDiagnostic
+  }
+
+  override def error(exception: SAXParseException): Unit = {
+    _isError = true
+    val embeddedDiagnostic = exception.getCause.asInstanceOf[Diagnostic]
+    _diagnostics :+= embeddedDiagnostic
+  }
+
+  override def fatalError(exception: SAXParseException): Unit = {
+    error(exception)
+  }
+
+  override def getDiagnostics: Seq[Diagnostic] = _diagnostics
+
+  override def isError: Boolean = _isError
+}

Review comment:
       Should this be moved to the TDML Runner and renamed to TDMLSAXErrorHandler? Users of the SAX API need to determine which way is best for them to handle errors. Gathering them up in a list might be a common use, but it most likely is not appropriate for all users of Daffodil, so it shouldn't be called *the* DaffodilErrorHandler.

##########
File path: daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala
##########
@@ -876,17 +873,20 @@ case class ParserTestCase(ptc: NodeSeq, parentArg: DFDLTestSuite)
         }
 
         val isErr: Boolean =
-          if (actual.isProcessingError) true
+          if (actual.isProcessingError ) true

Review comment:
       Revert the whitespace changes in this file.

##########
File path: daffodil-tdml-processor/src/main/scala/org/apache/daffodil/tdml/processor/DaffodilTDMLDFDLProcessor.scala
##########
@@ -234,21 +242,24 @@ class DaffodilTDMLDFDLProcessor private (private var dp: DataProcessor) extends
 
   override def getDiagnostics: Seq[Diagnostic] = dp.getDiagnostics
 
-  override def parse(is: java.io.InputStream, lengthLimitInBits: Long): TDMLParseResult = {
-
-    val outputter = new TDMLInfosetOutputter()
-    outputter.setBlobAttributes(blobDir, blobPrefix, blobSuffix)
+  def parse(uri: java.net.URI, lengthLimitInBits: Long): TDMLParseResult = {
+    val url = uri.toURL
+    val dpInputStream = url.openStream()
+    val saxInputStream = url.openStream()
+    doParseWithBothApis(dpInputStream, saxInputStream, lengthLimitInBits)
+  }
 
-    val dis = InputSourceDataInputStream(is)
-    if (lengthLimitInBits >= 0 && lengthLimitInBits % 8 != 0) {
-      // Only set the bit limit if the length is not a multiple of 8. In that
-      // case, we aren't expected to consume all the data and need a bitLimit
-      // to prevent messages about left over bits.
-      dis.setBitLimit0b(MaybeULong(lengthLimitInBits))
-    }
-    val actual = dp.parse(dis, outputter)
+  def parse(arr: Array[Byte], lengthLimitInBits: Long): TDMLParseResult = {
+    val dpInputStream = new ByteArrayInputStream(arr)
+    val saxInputStream = new ByteArrayInputStream(arr)
+    doParseWithBothApis(dpInputStream, saxInputStream, lengthLimitInBits)
+  }
 
-    new DaffodilTDMLParseResult(actual, outputter)
+  override def parse(is: java.io.InputStream, lengthLimitInBits: Long): TDMLParseResult = {
+    val arr = IOUtils.toByteArray(is)
+    val dpInputStream = new ByteArrayInputStream(arr)
+    val saxInputStream = new ByteArrayInputStream(arr)
+    doParseWithBothApis(dpInputStream, saxInputStream, lengthLimitInBits)

Review comment:
       Does anything actually use this? Can it be removed? My concern is that someone might think they are testing parsing an InputStream, but in actuality they are copying and testing byte arrays. Those two behaviors can be very different since one is more stream-y. If it cannot be removed, this should at least call parse(arr, lengthLimitInBits) rather than duplicating the ByteArrayInputStream logic.

##########
File path: daffodil-tdml-processor/src/main/scala/org/apache/daffodil/tdml/TDMLInfosetOutputter.scala
##########
@@ -47,6 +48,7 @@ class TDMLInfosetOutputter() extends InfosetOutputter {
   private val w3cdomOut = new W3CDOMInfosetOutputter()
   private val jsonOut = new JsonInfosetOutputter(jsonStream, false)
   private val xmlOut = new XMLTextInfosetOutputter(xmlStream, false)
+  private val ch = new SAXHandler()

Review comment:
       What does ``ch`` stand for? I don't think this is even used. Can it be removed?

##########
File path: daffodil-tdml-processor/src/main/scala/org/apache/daffodil/tdml/processor/DaffodilTDMLDFDLProcessor.scala
##########
@@ -274,6 +285,74 @@ class DaffodilTDMLDFDLProcessor private (private var dp: DataProcessor) extends
     new DaffodilTDMLUnparseResult(actual, outStream)
   }
 
+  def doParseWithBothApis(dpInputStream: java.io.InputStream, saxInputStream: java.io.InputStream,
+    lengthLimitInBits: Long): TDMLParseResult = {
+    val outputter = new TDMLInfosetOutputter()
+    outputter.setBlobAttributes(blobDir, blobPrefix, blobSuffix)
+
+    val xri = dp.newXMLReaderInstance
+    xri.setContentHandler(new SAXHandler())
+    xri.setErrorHandler(new DaffodilSAXErrorHandler())
+    xri.setProperty("BlobDirectory", blobDir)
+    xri.setProperty("BlobPrefix", blobPrefix)
+    xri.setProperty("BlobSuffix", blobSuffix)
+
+    val dis = InputSourceDataInputStream(dpInputStream)
+    val sis = InputSourceDataInputStream(saxInputStream)
+    if (lengthLimitInBits >= 0 && lengthLimitInBits % 8 != 0) {
+      // Only set the bit limit if the length is not a multiple of 8. In that
+      // case, we aren't expected to consume all the data and need a bitLimit
+      // to prevent messages about left over bits.
+      dis.setBitLimit0b(MaybeULong(lengthLimitInBits))
+      sis.setBitLimit0b(MaybeULong(lengthLimitInBits))
+    }
+    val actual = dp.parse(dis, outputter)
+    xri.parse(sis)
+    val errorHandler = xri.getErrorHandler.asInstanceOf[DaffodilSAXErrorHandler]
+
+    if( !actual.isError && !errorHandler.isError) {
+      verifySameParseOutput(outputter, xri)
+    }
+    verifySameDiagnostics(actual, xri)
+
+    new DaffodilTDMLParseResult(actual, outputter)
+  }
+
+  def verifySameParseOutput(dpOutputter: TDMLInfosetOutputter, xri: DFDL.DaffodilXMLReader): Unit = {
+    val dpParseOutput = dpOutputter.getResult()
+    val saxParseOutputString = new XMLOutputter()
+      .outputString(xri.getContentHandler.asInstanceOf[SAXHandler].getDocument)
+    val saxParseOutput = scala.xml.XML.loadString(saxParseOutputString)
+
+    try {
+      XMLUtils.compareAndReport(dpParseOutput, saxParseOutput, checkNamespaces = true, checkPrefixes = true)
+    } catch {
+      case e: XMLDifferenceException => {
+        throw TDMLException(
+          """SAX parse output (actual) does not match DataProcessor Parse output (expected)""" +
+            "\n" + e.getMessage, None)
+      }
+    }
+  }
+
+  private def verifySameDiagnostics(
+    actual: DFDL.ParseResult,
+    xri: DFDL.DaffodilXMLReader): Unit = {

Review comment:
       Like above, can this just take the TDMLErrorHandler as a parameter?

##########
File path: daffodil-test/src/test/resources/org/apache/daffodil/section07/variables/variables.tdml
##########
@@ -802,7 +802,7 @@
     <tdml:errors>
       <tdml:error>Schema Definition Error</tdml:error>
       <tdml:error>newVariableInstances must all be distinct within the same scope</tdml:error>
-      <tdml:error>ex:myVar1</tdml:error>
+      <tdml:error>myVar1</tdml:error>
     </tdml:errors>

Review comment:
       We should probably also add some tests for the Java and Scala APIs to make sure that people using those APIs can actually use this API. I suspect we'll need new wrapper classes for DaffodilXMLReader for japi and sapi like we have for DataProcessor.

##########
File path: daffodil-tdml-processor/src/main/scala/org/apache/daffodil/tdml/processor/DaffodilTDMLDFDLProcessor.scala
##########
@@ -274,6 +285,74 @@ class DaffodilTDMLDFDLProcessor private (private var dp: DataProcessor) extends
     new DaffodilTDMLUnparseResult(actual, outStream)
   }
 
+  def doParseWithBothApis(dpInputStream: java.io.InputStream, saxInputStream: java.io.InputStream,
+    lengthLimitInBits: Long): TDMLParseResult = {
+    val outputter = new TDMLInfosetOutputter()
+    outputter.setBlobAttributes(blobDir, blobPrefix, blobSuffix)
+
+    val xri = dp.newXMLReaderInstance
+    xri.setContentHandler(new SAXHandler())
+    xri.setErrorHandler(new DaffodilSAXErrorHandler())
+    xri.setProperty("BlobDirectory", blobDir)
+    xri.setProperty("BlobPrefix", blobPrefix)
+    xri.setProperty("BlobSuffix", blobSuffix)
+
+    val dis = InputSourceDataInputStream(dpInputStream)
+    val sis = InputSourceDataInputStream(saxInputStream)
+    if (lengthLimitInBits >= 0 && lengthLimitInBits % 8 != 0) {
+      // Only set the bit limit if the length is not a multiple of 8. In that
+      // case, we aren't expected to consume all the data and need a bitLimit
+      // to prevent messages about left over bits.
+      dis.setBitLimit0b(MaybeULong(lengthLimitInBits))
+      sis.setBitLimit0b(MaybeULong(lengthLimitInBits))
+    }
+    val actual = dp.parse(dis, outputter)
+    xri.parse(sis)
+    val errorHandler = xri.getErrorHandler.asInstanceOf[DaffodilSAXErrorHandler]
+
+    if( !actual.isError && !errorHandler.isError) {
+      verifySameParseOutput(outputter, xri)
+    }
+    verifySameDiagnostics(actual, xri)
+
+    new DaffodilTDMLParseResult(actual, outputter)
+  }
+
+  def verifySameParseOutput(dpOutputter: TDMLInfosetOutputter, xri: DFDL.DaffodilXMLReader): Unit = {

Review comment:
       Can this just accept the SAXHandler as the second param? I don't think it uses any other parts of the XMLReader.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org