You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@daffodil.apache.org by GitBox <gi...@apache.org> on 2018/07/17 12:09:18 UTC

[GitHub] stevedlawrence closed pull request #81: Reduce memory usage regressions in commit 07ee2434bb

stevedlawrence closed pull request #81: Reduce memory usage regressions in commit 07ee2434bb
URL: https://github.com/apache/incubator-daffodil/pull/81
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/daffodil-cli/src/main/scala/org/apache/daffodil/Main.scala b/daffodil-cli/src/main/scala/org/apache/daffodil/Main.scala
index 55e99e876..f12476e28 100644
--- a/daffodil-cli/src/main/scala/org/apache/daffodil/Main.scala
+++ b/daffodil-cli/src/main/scala/org/apache/daffodil/Main.scala
@@ -968,30 +968,21 @@ object Main extends Logging {
 
             val infosetType = performanceOpts.infosetType.toOption.get
 
-            val dataSeq = files.map { filePath =>
+            val dataSeq: Seq[Either[AnyRef, Array[Byte]]] = files.map { filePath =>
               val input = (new FileInputStream(filePath))
               val dataSize = filePath.length()
               val fileContent = new Array[Byte](dataSize.toInt)
               input.read(fileContent) // For performance testing, we want everything in memory so as to remove I/O from consideration.
               val data = performanceOpts.unparse() match {
-                case true => infosetDataToInputterData(infosetType, fileContent)
-                case false => fileContent
+                case true => Left(infosetDataToInputterData(infosetType, fileContent))
+                case false => Right(fileContent)
               }
               data
             }
 
             val inputs = (0 until performanceOpts.number()).map { n =>
               val index = n % dataSeq.length
-              val data = dataSeq(index)
-              val inData = performanceOpts.unparse() match {
-                case true => {
-                  Left(data)
-                }
-                case false => {
-                  Right(InputSourceDataInputStream(data.asInstanceOf[Array[Byte]]))
-                }
-              }
-              inData
+              dataSeq(index)
             }
             val inputsWithIndex = inputs.zipWithIndex
 
@@ -1022,9 +1013,10 @@ object Main extends Logging {
                         val inputterForUnparse = getInfosetInputter(infosetType, anyRef)
                         processor.unparse(inputterForUnparse, nullChannelForUnparse)
                       })
-                      case Right(dis) => Timer.getTimeResult({
+                      case Right(data) => Timer.getTimeResult({
+                        val input = InputSourceDataInputStream(data)
                         val outputterForParse = getInfosetOutputter(infosetType, nullWriterForParse)
-                        processor.parse(dis, outputterForParse)
+                        processor.parse(input, outputterForParse)
                       })
                     }
 
diff --git a/daffodil-io/src/main/scala/org/apache/daffodil/io/InputSourceDataInputStream.scala b/daffodil-io/src/main/scala/org/apache/daffodil/io/InputSourceDataInputStream.scala
index 980a201fd..3f093bc64 100644
--- a/daffodil-io/src/main/scala/org/apache/daffodil/io/InputSourceDataInputStream.scala
+++ b/daffodil-io/src/main/scala/org/apache/daffodil/io/InputSourceDataInputStream.scala
@@ -701,15 +701,16 @@ class InputSourceDataInputStreamCharIteratorState  {
   }
 
   def assignFrom(other: InputSourceDataInputStreamCharIteratorState) {
-    // threadCheck()
-    // TODO: we want to avoid this duplication, we should really only duplicate
-    // when we creating a mark. When we reset, we just need to assign
-    // decodedChars = other.decodedChars. This API doesn't currently
-    // differentiate between saving a mark and resetting a mark
-    this.decodedChars = other.decodedChars.duplicate
-    this.bitPositions = other.bitPositions.duplicate
-    this.moreDataAvailable = other.moreDataAvailable
-    this.bitPositionAtLastFetch0b = other.bitPositionAtLastFetch0b
+    // We are intentionally not saving/restoring any state here. This is
+    // because saving state requires duplicating the long and charbuffers,
+    // which is fairly expensive and can use up alot of memory, especially when
+    // there are lots of points of uncertainties. Instead, the
+    // checkNeedsRefetch method will determine if something changed the
+    // bitPosition (e.g. resetting a mark), and just clear all the internal
+    // state, leading to a redecode of data. This does mean that if we reset
+    // back to a mark we will repeat work that has already been done. But that
+    // should be relatively fast, avoids memory usage, and only takes a penalty
+    // when backtracking rather than every time there is a PoU.
   }
 }
 
@@ -734,9 +735,11 @@ class InputSourceDataInputStreamCharIterator(dis: InputSourceDataInputStream) ex
     val cis = dis.cst.charIteratorState
     if (cis.bitPositionAtLastFetch0b != dis.bitPos0b) {
       // Something outside of the char iterator  moved the bit position since
-      // the last time we fetched data. In this case, all of our cached decoded
-      // data is wrong. So lets clear the iterator state and reset the last
-      // fetch state so that more data will be fetched
+      // the last time we fetched data (e.g. backtracking to a previous mark,
+      // in which we do not save/restore this iterator state data). In this
+      // case, all of our cached decoded data is wrong. So lets clear the
+      // iterator state and reset the last fetch state so that more data will
+      // be fetched
       dis.cst.charIteratorState.clear()
       dis.cst.charIteratorState.bitPositionAtLastFetch0b = dis.bitPos0b
     }
diff --git a/daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/Runtime.scala b/daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/Runtime.scala
index 8687c8f05..fa115586e 100644
--- a/daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/Runtime.scala
+++ b/daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/Runtime.scala
@@ -17,6 +17,17 @@
 
 package org.apache.daffodil.processors
 
+import java.io.File
+import java.io.ObjectOutputStream
+import java.nio.channels.Channels
+import java.nio.CharBuffer
+import java.nio.LongBuffer
+import java.util.zip.GZIPOutputStream
+
+import org.xml.sax.ErrorHandler
+import org.xml.sax.SAXException
+import org.xml.sax.SAXParseException
+
 import org.apache.daffodil.Implicits._; object INoWarn4 { ImplicitsSuppressUnusedImportWarning() }
 import org.apache.daffodil.equality._; object EqualityNoWarn3 { EqualitySuppressUnusedImportWarning() }
 import org.apache.daffodil.api.WithDiagnostics
@@ -27,18 +38,13 @@ import org.apache.daffodil.api.DFDL
 import org.apache.daffodil.api.WithDiagnostics
 import org.apache.daffodil.api.DFDL
 import org.apache.daffodil.util.Validator
-import org.xml.sax.SAXParseException
 import org.apache.daffodil.api.ValidationMode
 import org.apache.daffodil.externalvars.ExternalVariablesLoader
-import java.io.File
 import org.apache.daffodil.externalvars.Binding
-import java.nio.channels.Channels
 import org.apache.daffodil.util.Maybe
 import org.apache.daffodil.util.Maybe._
-import java.io.ObjectOutputStream
 import org.apache.daffodil.util.Logging
 import org.apache.daffodil.debugger.Debugger
-import java.util.zip.GZIPOutputStream
 import org.apache.daffodil.processors.unparsers.UState
 import org.apache.daffodil.infoset.InfosetInputter
 import org.apache.daffodil.processors.unparsers.UnparseError
@@ -47,8 +53,6 @@ import org.apache.daffodil.events.MultipleEventHandler
 import org.apache.daffodil.io.DirectOrBufferedDataOutputStream
 import org.apache.daffodil.io.InputSourceDataInputStream
 import org.apache.daffodil.util.LogLevel
-import org.xml.sax.ErrorHandler
-import org.xml.sax.SAXException
 import org.apache.daffodil.io.BitOrderChangeException
 import org.apache.daffodil.infoset._
 import org.apache.daffodil.processors.parsers.ParseError
@@ -96,6 +100,20 @@ class DataProcessor(val ssrd: SchemaSetRuntimeData)
 
   protected var tunablesObj = ssrd.tunable // Compiler-set tunables
 
+  // This thread local state is used by the PState when it needs buffers for
+  // regex matching. This cannot be in PState because a PState does not last
+  // beyond a single parse, but we want to share this among different parses to
+  // avoid large memory allocations. The alternative is to use a ThreadLocal
+  // companion object, but that would have not access to tunables, so one could
+  // not configure the size of the regex match buffers.
+  @transient lazy val regexMatchState = new ThreadLocal[(CharBuffer, LongBuffer)] {
+    override def initialValue = {
+      val cb = CharBuffer.allocate(tunablesObj.maximumRegexMatchLengthInCharacters)
+      val lb = LongBuffer.allocate(tunablesObj.maximumRegexMatchLengthInCharacters)
+      (cb, lb)
+    }
+  }
+
   def setValidationMode(mode: ValidationMode.Type): Unit = { ssrd.validationMode = mode }
   def getValidationMode() = ssrd.validationMode
   def getVariables = ssrd.variables
diff --git a/daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/parsers/PState.scala b/daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/parsers/PState.scala
index e5e8b4dae..205f2f3c7 100644
--- a/daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/parsers/PState.scala
+++ b/daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/parsers/PState.scala
@@ -18,8 +18,6 @@
 package org.apache.daffodil.processors.parsers
 
 import java.nio.channels.Channels
-import java.nio.CharBuffer
-import java.nio.LongBuffer
 
 import scala.Right
 import scala.collection.mutable
@@ -322,9 +320,7 @@ final class PState private (
     }
   }
 
-  // TODO: this should come from a thread safe pool or something so that we do not allocate it for every parse
-  override lazy val regexMatchBuffer = CharBuffer.allocate(tunable.maximumRegexMatchLengthInCharacters)
-  override lazy val regexMatchBitPositionBuffer = LongBuffer.allocate(tunable.maximumRegexMatchLengthInCharacters)
+  override lazy val (regexMatchBuffer, regexMatchBitPositionBuffer) = dataProcArg.regexMatchState.get
 }
 
 object PState {


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services