You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@daffodil.apache.org by sl...@apache.org on 2018/01/06 16:25:34 UTC

[incubator-daffodil] branch master updated: Optimizations with Infoset HashMap lookups/insertions

This is an automated email from the ASF dual-hosted git repository.

slawrence pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-daffodil.git


The following commit(s) were added to refs/heads/master by this push:
     new eec6c76  Optimizations with Infoset HashMap lookups/insertions
eec6c76 is described below

commit eec6c76a36edeecebf87ab5a3d80f9040442f120
Author: Steve Lawrence <sl...@apache.org>
AuthorDate: Fri Jan 5 11:07:33 2018 -0500

    Optimizations with Infoset HashMap lookups/insertions
    
    - Add a new variable to DPathElementCompileInfo that keeps track of
      which path steps are used in an expression. Because the variable is
      set when expression are compiled, which relies on the
      DPathElementCompileInfo already existing, this new variable is a var
      that is toggled during expression compilation. This replaces the
      notReferencedByExpressions in the ElementRuntimeData, which isn't
      actually used, and can't work since the ERD must exist before
      expression compilation.
    - Use this new variable to only insert elements into the Infoset fast
      lookup hash map if that flag is set and thus might be needed in an
      expression. This minimizes the number of hash insertions, greatly
      improving performance.
    - Add a new tunable, allowExternalPathExpressions, which allows one to
      evaluate expression on path steps that were not compiled at schema
      compilation time (e.g. using expressions when debugging). When this
      tunable is true and a path step is not found in the fast lookup hash
      map, a slower linear search is performed to see if the element exists.
      This is slower, but should only occur when debugging/testing. This
      tunable is automatically set when debugging is enabled.
    - Remove the resetChildArray function. This isn't used anywhere and is
      not adequately tested, and it isn't clear what it is supposed to do.
    - In the hashmap, rather than checking if the hashmap containsKey()
      and then calling get() if true, just call get() and check for null for
      existence. This avoids an extra hash lookup.
    - Use += rather than append() to add to an ArrayBuffer. The append() takes a
      vararg, which causes scala to allocate a WrappedArray for a single
      element. Instead, use += which avoids that allocation. Similary,
      instead of the companion object ArrayBuffer(), which uses a vararg
      constructor, just use "new ArrayBuffer()" and append to it using +=,
      also avoiding a WrappedArray allocation.
    - Use reduceToSize rather than dropRight with the ArrayBuffer. dropRight
      is more general and slower than reduceToSize.
    - Make the childNodes array non-lazy. The vast majority of complex
      elements will end up using the array and causing it to be allocated,
      so this removes all the lazy checks to determine if it was initialized
      or not.
    
    DAFFODIL-1860
---
 .../illinois/ncsa/daffodil/dpath/Expression.scala  |   1 +
 .../illinois/ncsa/daffodil/dsom/ElementBase.scala  |   1 -
 .../ncsa/daffodil/infoset/TestInfoset.scala        |   6 +-
 .../infoset/TestInfosetCursorFromReader2.scala     |   1 -
 .../ncsa/daffodil/api/DaffodilTunables.scala       |  12 ++-
 .../ncsa/daffodil/dsom/CompiledExpression1.scala   |   9 ++
 .../ncsa/daffodil/infoset/InfosetImpl.scala        | 117 +++++++++++----------
 .../ncsa/daffodil/processors/Runtime.scala         |   1 +
 .../ncsa/daffodil/processors/RuntimeData.scala     |  10 --
 9 files changed, 86 insertions(+), 72 deletions(-)

diff --git a/daffodil-core/src/main/scala/edu/illinois/ncsa/daffodil/dpath/Expression.scala b/daffodil-core/src/main/scala/edu/illinois/ncsa/daffodil/dpath/Expression.scala
index a1398b6..ce0125b 100644
--- a/daffodil-core/src/main/scala/edu/illinois/ncsa/daffodil/dpath/Expression.scala
+++ b/daffodil-core/src/main/scala/edu/illinois/ncsa/daffodil/dpath/Expression.scala
@@ -1123,6 +1123,7 @@ case class NamedStep(s: String, predArg: Option[PredicateExpression])
       }.getOrElse(die)
       e
     }
+    stepElem.isReferencedByExpressions = true
     stepElem
   }
 }
diff --git a/daffodil-core/src/main/scala/edu/illinois/ncsa/daffodil/dsom/ElementBase.scala b/daffodil-core/src/main/scala/edu/illinois/ncsa/daffodil/dsom/ElementBase.scala
index b35a40c..1602662 100644
--- a/daffodil-core/src/main/scala/edu/illinois/ncsa/daffodil/dsom/ElementBase.scala
+++ b/daffodil-core/src/main/scala/edu/illinois/ncsa/daffodil/dsom/ElementBase.scala
@@ -557,7 +557,6 @@ trait ElementBase
       //
       // unparser specific items
       //
-      false, // !isReferencedByExpressions, // assume it is always to be referenced by expressions
       optTruncateSpecifiedLengthString,
       if (isOutputValueCalc) Some(ovcCompiledExpression) else None,
       maybeBinaryFloatRepEv,
diff --git a/daffodil-core/src/test/scala/edu/illinois/ncsa/daffodil/infoset/TestInfoset.scala b/daffodil-core/src/test/scala/edu/illinois/ncsa/daffodil/infoset/TestInfoset.scala
index a134482..6ab72be 100644
--- a/daffodil-core/src/test/scala/edu/illinois/ncsa/daffodil/infoset/TestInfoset.scala
+++ b/daffodil-core/src/test/scala/edu/illinois/ncsa/daffodil/infoset/TestInfoset.scala
@@ -56,9 +56,11 @@ object TestInfoset {
    * classes.
    */
 
-  def elem2Infoset(erd: ElementRuntimeData, xmlElem: scala.xml.Node, tunable: DaffodilTunables = DaffodilTunables()): InfosetElement = {
+  private val tunableForTests = DaffodilTunables("allowExternalPathExpressions", "true")
+
+  def elem2Infoset(erd: ElementRuntimeData, xmlElem: scala.xml.Node): InfosetElement = {
     val ic = new ScalaXMLInfosetInputter(xmlElem)
-    ic.initialize(erd, tunable)
+    ic.initialize(erd, tunableForTests)
     val aacc = ic.advanceAccessor
     Assert.invariant(ic.advance == true)
     val infosetRootNode = aacc.node
diff --git a/daffodil-core/src/test/scala/edu/illinois/ncsa/daffodil/infoset/TestInfosetCursorFromReader2.scala b/daffodil-core/src/test/scala/edu/illinois/ncsa/daffodil/infoset/TestInfosetCursorFromReader2.scala
index 4932760..11412b0 100644
--- a/daffodil-core/src/test/scala/edu/illinois/ncsa/daffodil/infoset/TestInfosetCursorFromReader2.scala
+++ b/daffodil-core/src/test/scala/edu/illinois/ncsa/daffodil/infoset/TestInfosetCursorFromReader2.scala
@@ -138,7 +138,6 @@ class TestInfosetInputterFromReader2 {
       val arr = bar_s.getChildArray(foo_1_s.runtimeData)
       if (arr.length % 100L =#= 0L) {
         // println("array length is " + arr.length)
-        bar_s.resetChildArray(0)
         foo_arr_s.reduceToSize(0)
       }
       arr.asInstanceOf[DIArray].children
diff --git a/daffodil-lib/src/main/scala/edu/illinois/ncsa/daffodil/api/DaffodilTunables.scala b/daffodil-lib/src/main/scala/edu/illinois/ncsa/daffodil/api/DaffodilTunables.scala
index 2f8b9fa..36f881c 100644
--- a/daffodil-lib/src/main/scala/edu/illinois/ncsa/daffodil/api/DaffodilTunables.scala
+++ b/daffodil-lib/src/main/scala/edu/illinois/ncsa/daffodil/api/DaffodilTunables.scala
@@ -111,7 +111,16 @@ case class DaffodilTunables(
   //
   val initialElementOccurrencesHint: Long = 10,
   val unqualifiedPathStepPolicy: UnqualifiedPathStepPolicy.Type = UnqualifiedPathStepPolicy.NoNamespace,
-  val suppressSchemaDefinitionWarnings: List[WarnID] = Nil)
+  val suppressSchemaDefinitionWarnings: List[WarnID] = Nil,
+
+  // By default, path expressions in Daffodil will only work correctly if path
+  // steps are used in an expression defined in the schema when compiled. To
+  // enable the use of other expressions (e.g. during debugging, where not all
+  // expressions are known at schema compile time), set this tunable to true.
+  // This may cause a degredation of performance in path expression evaluation,
+  // so this should be avoided when in production. This flag is automatically
+  // enabled when debugging is enabled.
+  val allowExternalPathExpressions: Boolean = false)
   extends Serializable
   with Logging
   with DataStreamLimits {
@@ -211,6 +220,7 @@ case class DaffodilTunables(
         }
         this.copy(suppressSchemaDefinitionWarnings = warningsList)
       }
+      case "allowexternalpathexpressions" => this.copy(allowExternalPathExpressions = java.lang.Boolean.valueOf(value))
       case _ => {
         log(LogLevel.Warning, "Ignoring unknown tunable: %s", tunable)
         this
diff --git a/daffodil-runtime1/src/main/scala/edu/illinois/ncsa/daffodil/dsom/CompiledExpression1.scala b/daffodil-runtime1/src/main/scala/edu/illinois/ncsa/daffodil/dsom/CompiledExpression1.scala
index cd24bcf..51f0ed4 100644
--- a/daffodil-runtime1/src/main/scala/edu/illinois/ncsa/daffodil/dsom/CompiledExpression1.scala
+++ b/daffodil-runtime1/src/main/scala/edu/illinois/ncsa/daffodil/dsom/CompiledExpression1.scala
@@ -333,6 +333,15 @@ class DPathElementCompileInfo(
     elementChildrenCompileInfo
   }
 
+  /**
+   * Stores whether or not this element is used in any path step expressions
+   * during schema compilation. Note that this needs to be a var since its
+   * value is determined during DPath compilation, which requires that the
+   * DPathElementCompileInfo already exists. So this must be a mutable value
+   * that can be flipped during schema compilation.
+   */
+  var isReferencedByExpressions = false
+
   override def toString = "DPathElementCompileInfo(%s)".format(name)
 
   @throws(classOf[java.io.IOException])
diff --git a/daffodil-runtime1/src/main/scala/edu/illinois/ncsa/daffodil/infoset/InfosetImpl.scala b/daffodil-runtime1/src/main/scala/edu/illinois/ncsa/daffodil/infoset/InfosetImpl.scala
index 1458114..f73adfe 100644
--- a/daffodil-runtime1/src/main/scala/edu/illinois/ncsa/daffodil/infoset/InfosetImpl.scala
+++ b/daffodil-runtime1/src/main/scala/edu/illinois/ncsa/daffodil/infoset/InfosetImpl.scala
@@ -1323,7 +1323,7 @@ sealed class DIComplex(override val erd: ElementRuntimeData, val tunable: Daffod
     }
   }
 
-  lazy val childNodes = new ArrayBuffer[DINode]
+  val childNodes = new ArrayBuffer[DINode]
   lazy val nameToChildNodeLookup = new HashMap[NamedQName, ArrayBuffer[DINode]]
 
   override lazy val contents: IndexedSeq[DINode] = childNodes
@@ -1335,8 +1335,9 @@ sealed class DIComplex(override val erd: ElementRuntimeData, val tunable: Daffod
   }
 
   final def getChild(info: DPathElementCompileInfo): InfosetElement = {
-    if (nameToChildNodeLookup.containsKey(info.namedQName))
-      nameToChildNodeLookup.get(info.namedQName)(0).asInstanceOf[InfosetElement]
+    val maybeNode = findChild(info.namedQName)
+    if (maybeNode.isDefined)
+      maybeNode.get.asInstanceOf[InfosetElement]
     else
       throw new InfosetNoSuchChildElementException(this, info)
   }
@@ -1349,48 +1350,12 @@ sealed class DIComplex(override val erd: ElementRuntimeData, val tunable: Daffod
 
   final def getChildArray(info: DPathElementCompileInfo): InfosetArray = {
     Assert.usage(info.isArray)
-
-    val name = info.namedQName
-
-    val array = if (nameToChildNodeLookup.containsKey(name)) {
-      val seq = nameToChildNodeLookup.get(name)
-      // Don't support query expressions yet, so should only have
-      // one item in the list
-      //
-      seq(0).asInstanceOf[InfosetArray] //.find(node => node.isInstanceOf[DIArray]).getOrElse(Assert.usageError("not an array")).asInstanceOf[InfosetArray]
-    } else
+    val maybeNode = findChild(info.namedQName)
+    if (maybeNode.isDefined) {
+      maybeNode.get.asInstanceOf[InfosetArray]
+    } else {
       throw new InfosetNoSuchChildElementException(this, info)
-
-    array
-  }
-
-  /**
-   * Used to prune parts of the array away to allow streaming. It allows us
-   * to not keep all of the children of the array in the infoset tree for the
-   * duration of the parse/unparse, once they are no longer needed.
-   */
-  final def resetChildArray(slot: Int) {
-    val numChildrenToRemove = numChildren - slot
-
-    var i = numChildrenToRemove
-
-    while (i > 0) {
-      val childToRemove = childNodes(i)
-      if (nameToChildNodeLookup.containsKey(childToRemove.namedQName)) {
-        val fastSeq = nameToChildNodeLookup.get(childToRemove.namedQName)
-        if (fastSeq.length == 1)
-          // last one, remove the whole key entry
-          nameToChildNodeLookup.remove(childToRemove.namedQName)
-        else
-          // not the last one, just drop the end
-          fastSeq.dropRight(1)
-      } else { /* Does not exist, nothing to do? */ }
-
-      i -= 1
     }
-
-    childNodes.dropRight(numChildrenToRemove)
-    _numChildren = childNodes.length
   }
 
   override def addChild(e: InfosetElement): Unit = {
@@ -1400,16 +1365,20 @@ sealed class DIComplex(override val erd: ElementRuntimeData, val tunable: Daffod
         // no children, or last child is not a DIArray for
         // this element, create the DIArray
         val ia = new DIArray(childERD, this)
-        addChildToFastLookup(ia)
-        childNodes.append(ia)
+        if (childERD.dpathElementCompileInfo.isReferencedByExpressions) {
+          addChildToFastLookup(ia)
+        }
+        childNodes += ia
         _numChildren = childNodes.length
         ia
       }
       // Array is now always last, add the new child to it
       childNodes.last.asInstanceOf[DIArray].append(e)
     } else {
-      childNodes.append(e.asInstanceOf[DINode])
-      addChildToFastLookup(e.asInstanceOf[DINode])
+      if (e.runtimeData.dpathElementCompileInfo.isReferencedByExpressions) {
+        addChildToFastLookup(e.asInstanceOf[DINode])
+      }
+      childNodes += e.asInstanceOf[DINode]
       _numChildren = childNodes.length
     }
     e.setParent(this)
@@ -1417,11 +1386,39 @@ sealed class DIComplex(override val erd: ElementRuntimeData, val tunable: Daffod
 
   def addChildToFastLookup(node: DINode): Unit = {
     val name = node.namedQName
+    val fastSeq = nameToChildNodeLookup.get(name)
+    if (fastSeq != null) {
+      fastSeq += node
+    } else {
+      val ab = new ArrayBuffer[DINode]()
+      ab += node
+      nameToChildNodeLookup.put(name, ab)
+    }
+  }
 
-    if (nameToChildNodeLookup.containsKey(name)) {
-      nameToChildNodeLookup.get(name).append(node)
+  def findChild(qname: NamedQName): Maybe[DINode] = {
+    val fastSeq = nameToChildNodeLookup.get(qname)
+    if (fastSeq != null) {
+      // Daffodil does not support query expressions yet, so there should only
+      // be one item in the list
+      Assert.invariant(fastSeq.length == 1)
+      One(fastSeq(0))
+    } else if (tunable.allowExternalPathExpressions) {
+      // Only DINodes used in expressions defined in the schema are added to
+      // the nameToChildNodeLookup hashmap. If an expression defined outside of
+      // the schema (like via the debugger) attempts to access an element that
+      // was never used in a schema expression, it will never be found. If the
+      // appropriate tunable is set, we will do a linear search to find the
+      // element. Due to the slowness of this, this should only be enabled
+      // during debugging or testing.
+      val found = childNodes.filter(_.erd.namedQName == qname)
+
+      // Daffodil does not support query expressions yet, so there should be at
+      // most one item found
+      Assert.invariant(found.length <= 1)
+      Maybe.toMaybe(found.headOption)
     } else {
-      nameToChildNodeLookup.put(name, ArrayBuffer(node))
+      Nope
     }
   }
 
@@ -1444,15 +1441,17 @@ sealed class DIComplex(override val erd: ElementRuntimeData, val tunable: Daffod
     // this should always be the last element in the hashmap seq
     while (i >= cs._numChildren) {
       val childToRemove = childNodes(i)
-      if (nameToChildNodeLookup.containsKey(childToRemove.namedQName)) {
+      if (childToRemove.erd.dpathElementCompileInfo.isReferencedByExpressions) {
         val fastSeq = nameToChildNodeLookup.get(childToRemove.namedQName)
-        if (fastSeq.length == 1)
+        Assert.invariant(fastSeq != null)
+        if (fastSeq.length == 1) {
           // last one, remove the whole key entry
           nameToChildNodeLookup.remove(childToRemove.namedQName)
-        else
+        } else {
           // not the last one, just drop the end
-          fastSeq.dropRight(1)
-      } else { /* Nothing to do? Doesn't exist. */ }
+          fastSeq.reduceToSize(fastSeq.length - 1)
+        }
+      }
 
       i -= 1
     }
@@ -1507,8 +1506,12 @@ final class DIDocument(erd: ElementRuntimeData, tunable: DaffodilTunables)
     //
     Assert.invariant(childNodes.length == 0)
     val node = child.asInstanceOf[DINode]
-    childNodes.append(node)
-    nameToChildNodeLookup.put(node.namedQName, ArrayBuffer(node))
+    childNodes += node
+    if (node.erd.dpathElementCompileInfo.isReferencedByExpressions) {
+      val ab = new ArrayBuffer[DINode]()
+      ab += node
+      nameToChildNodeLookup.put(node.namedQName, ab)
+    }
     child.setParent(this)
     root = child.asInstanceOf[DIElement]
   }
diff --git a/daffodil-runtime1/src/main/scala/edu/illinois/ncsa/daffodil/processors/Runtime.scala b/daffodil-runtime1/src/main/scala/edu/illinois/ncsa/daffodil/processors/Runtime.scala
index b74e649..1b1f8a1 100644
--- a/daffodil-runtime1/src/main/scala/edu/illinois/ncsa/daffodil/processors/Runtime.scala
+++ b/daffodil-runtime1/src/main/scala/edu/illinois/ncsa/daffodil/processors/Runtime.scala
@@ -141,6 +141,7 @@ class DataProcessor(val ssrd: SchemaSetRuntimeData)
 
   def setDebugging(flag: Boolean) {
     areDebugging_ = flag
+    setTunable("allowExternalPathExpressions", flag.toString)
   }
 
   def setExternalVariables(extVars: Map[String, String]): Unit = {
diff --git a/daffodil-runtime1/src/main/scala/edu/illinois/ncsa/daffodil/processors/RuntimeData.scala b/daffodil-runtime1/src/main/scala/edu/illinois/ncsa/daffodil/processors/RuntimeData.scala
index 713a5b0..827924d 100644
--- a/daffodil-runtime1/src/main/scala/edu/illinois/ncsa/daffodil/processors/RuntimeData.scala
+++ b/daffodil-runtime1/src/main/scala/edu/illinois/ncsa/daffodil/processors/RuntimeData.scala
@@ -662,12 +662,6 @@ final class ElementRuntimeData(
   //
   // Unparser-specific arguments
   //
-  /**
-   * pass true for this if the corresponding infoset element is never
-   * accessed by way of expressions. Enables the element to be dropped
-   * from the infoset immediately after unparsing is complete.
-   */
-  @TransientParam notReferencedByExpressionsArg: => Boolean,
   @TransientParam optTruncateSpecifiedLengthStringArg: => Option[Boolean],
   @TransientParam outputValueCalcExprArg: => Option[CompiledExpression[AnyRef]],
   @TransientParam maybeBinaryFloatRepEvArg: => Maybe[BinaryFloatRepEv],
@@ -710,14 +704,11 @@ final class ElementRuntimeData(
   lazy val namedQName = namedQNameArg
   lazy val impliedRepresentation = impliedRepresentationArg
   lazy val optDefaultValue = optDefaultValueArg
-  lazy val notReferencedByExpressions = notReferencedByExpressionsArg
   lazy val optTruncateSpecifiedLengthString = optTruncateSpecifiedLengthStringArg
   lazy val outputValueCalcExpr = outputValueCalcExprArg
   lazy val maybeBinaryFloatRepEv = maybeBinaryFloatRepEvArg
   lazy val maybeByteOrderEv = maybeByteOrderEvArg
 
-  def isReferencedByExpressions = !notReferencedByExpressions
-
   override def preSerialization: Unit = {
     super.preSerialization
     parent
@@ -750,7 +741,6 @@ final class ElementRuntimeData(
     namedQName
     impliedRepresentation
     optDefaultValue
-    notReferencedByExpressions
     optTruncateSpecifiedLengthString
     outputValueCalcExpr
     maybeBinaryFloatRepEv

-- 
To stop receiving notification emails like this one, please contact
['"commits@daffodil.apache.org" <co...@daffodil.apache.org>'].