You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@daffodil.apache.org by GitBox <gi...@apache.org> on 2018/07/17 12:09:18 UTC
[GitHub] stevedlawrence closed pull request #81: Reduce memory usage
regressions in commit 07ee2434bb
stevedlawrence closed pull request #81: Reduce memory usage regressions in commit 07ee2434bb
URL: https://github.com/apache/incubator-daffodil/pull/81
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):
diff --git a/daffodil-cli/src/main/scala/org/apache/daffodil/Main.scala b/daffodil-cli/src/main/scala/org/apache/daffodil/Main.scala
index 55e99e876..f12476e28 100644
--- a/daffodil-cli/src/main/scala/org/apache/daffodil/Main.scala
+++ b/daffodil-cli/src/main/scala/org/apache/daffodil/Main.scala
@@ -968,30 +968,21 @@ object Main extends Logging {
val infosetType = performanceOpts.infosetType.toOption.get
- val dataSeq = files.map { filePath =>
+ val dataSeq: Seq[Either[AnyRef, Array[Byte]]] = files.map { filePath =>
val input = (new FileInputStream(filePath))
val dataSize = filePath.length()
val fileContent = new Array[Byte](dataSize.toInt)
input.read(fileContent) // For performance testing, we want everything in memory so as to remove I/O from consideration.
val data = performanceOpts.unparse() match {
- case true => infosetDataToInputterData(infosetType, fileContent)
- case false => fileContent
+ case true => Left(infosetDataToInputterData(infosetType, fileContent))
+ case false => Right(fileContent)
}
data
}
val inputs = (0 until performanceOpts.number()).map { n =>
val index = n % dataSeq.length
- val data = dataSeq(index)
- val inData = performanceOpts.unparse() match {
- case true => {
- Left(data)
- }
- case false => {
- Right(InputSourceDataInputStream(data.asInstanceOf[Array[Byte]]))
- }
- }
- inData
+ dataSeq(index)
}
val inputsWithIndex = inputs.zipWithIndex
@@ -1022,9 +1013,10 @@ object Main extends Logging {
val inputterForUnparse = getInfosetInputter(infosetType, anyRef)
processor.unparse(inputterForUnparse, nullChannelForUnparse)
})
- case Right(dis) => Timer.getTimeResult({
+ case Right(data) => Timer.getTimeResult({
+ val input = InputSourceDataInputStream(data)
val outputterForParse = getInfosetOutputter(infosetType, nullWriterForParse)
- processor.parse(dis, outputterForParse)
+ processor.parse(input, outputterForParse)
})
}
diff --git a/daffodil-io/src/main/scala/org/apache/daffodil/io/InputSourceDataInputStream.scala b/daffodil-io/src/main/scala/org/apache/daffodil/io/InputSourceDataInputStream.scala
index 980a201fd..3f093bc64 100644
--- a/daffodil-io/src/main/scala/org/apache/daffodil/io/InputSourceDataInputStream.scala
+++ b/daffodil-io/src/main/scala/org/apache/daffodil/io/InputSourceDataInputStream.scala
@@ -701,15 +701,16 @@ class InputSourceDataInputStreamCharIteratorState {
}
def assignFrom(other: InputSourceDataInputStreamCharIteratorState) {
- // threadCheck()
- // TODO: we want to avoid this duplication, we should really only duplicate
- // when we creating a mark. When we reset, we just need to assign
- // decodedChars = other.decodedChars. This API doesn't currently
- // differentiate between saving a mark and resetting a mark
- this.decodedChars = other.decodedChars.duplicate
- this.bitPositions = other.bitPositions.duplicate
- this.moreDataAvailable = other.moreDataAvailable
- this.bitPositionAtLastFetch0b = other.bitPositionAtLastFetch0b
+ // We are intentionally not saving/restoring any state here. This is
+ // because saving state requires duplicating the long and charbuffers,
+ // which is fairly expensive and can use up alot of memory, especially when
+ // there are lots of points of uncertainties. Instead, the
+ // checkNeedsRefetch method will determine if something changed the
+ // bitPosition (e.g. resetting a mark), and just clear all the internal
+ // state, leading to a redecode of data. This does mean that if we reset
+ // back to a mark we will repeat work that has already been done. But that
+ // should be relatively fast, avoids memory usage, and only takes a penalty
+ // when backtracking rather than every time there is a PoU.
}
}
@@ -734,9 +735,11 @@ class InputSourceDataInputStreamCharIterator(dis: InputSourceDataInputStream) ex
val cis = dis.cst.charIteratorState
if (cis.bitPositionAtLastFetch0b != dis.bitPos0b) {
// Something outside of the char iterator moved the bit position since
- // the last time we fetched data. In this case, all of our cached decoded
- // data is wrong. So lets clear the iterator state and reset the last
- // fetch state so that more data will be fetched
+ // the last time we fetched data (e.g. backtracking to a previous mark,
+ // in which we do not save/restore this iterator state data). In this
+ // case, all of our cached decoded data is wrong. So lets clear the
+ // iterator state and reset the last fetch state so that more data will
+ // be fetched
dis.cst.charIteratorState.clear()
dis.cst.charIteratorState.bitPositionAtLastFetch0b = dis.bitPos0b
}
diff --git a/daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/Runtime.scala b/daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/Runtime.scala
index 8687c8f05..fa115586e 100644
--- a/daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/Runtime.scala
+++ b/daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/Runtime.scala
@@ -17,6 +17,17 @@
package org.apache.daffodil.processors
+import java.io.File
+import java.io.ObjectOutputStream
+import java.nio.channels.Channels
+import java.nio.CharBuffer
+import java.nio.LongBuffer
+import java.util.zip.GZIPOutputStream
+
+import org.xml.sax.ErrorHandler
+import org.xml.sax.SAXException
+import org.xml.sax.SAXParseException
+
import org.apache.daffodil.Implicits._; object INoWarn4 { ImplicitsSuppressUnusedImportWarning() }
import org.apache.daffodil.equality._; object EqualityNoWarn3 { EqualitySuppressUnusedImportWarning() }
import org.apache.daffodil.api.WithDiagnostics
@@ -27,18 +38,13 @@ import org.apache.daffodil.api.DFDL
import org.apache.daffodil.api.WithDiagnostics
import org.apache.daffodil.api.DFDL
import org.apache.daffodil.util.Validator
-import org.xml.sax.SAXParseException
import org.apache.daffodil.api.ValidationMode
import org.apache.daffodil.externalvars.ExternalVariablesLoader
-import java.io.File
import org.apache.daffodil.externalvars.Binding
-import java.nio.channels.Channels
import org.apache.daffodil.util.Maybe
import org.apache.daffodil.util.Maybe._
-import java.io.ObjectOutputStream
import org.apache.daffodil.util.Logging
import org.apache.daffodil.debugger.Debugger
-import java.util.zip.GZIPOutputStream
import org.apache.daffodil.processors.unparsers.UState
import org.apache.daffodil.infoset.InfosetInputter
import org.apache.daffodil.processors.unparsers.UnparseError
@@ -47,8 +53,6 @@ import org.apache.daffodil.events.MultipleEventHandler
import org.apache.daffodil.io.DirectOrBufferedDataOutputStream
import org.apache.daffodil.io.InputSourceDataInputStream
import org.apache.daffodil.util.LogLevel
-import org.xml.sax.ErrorHandler
-import org.xml.sax.SAXException
import org.apache.daffodil.io.BitOrderChangeException
import org.apache.daffodil.infoset._
import org.apache.daffodil.processors.parsers.ParseError
@@ -96,6 +100,20 @@ class DataProcessor(val ssrd: SchemaSetRuntimeData)
protected var tunablesObj = ssrd.tunable // Compiler-set tunables
+ // This thread local state is used by the PState when it needs buffers for
+ // regex matching. This cannot be in PState because a PState does not last
+ // beyond a single parse, but we want to share this among different parses to
+ // avoid large memory allocations. The alternative is to use a ThreadLocal
+ // companion object, but that would have not access to tunables, so one could
+ // not configure the size of the regex match buffers.
+ @transient lazy val regexMatchState = new ThreadLocal[(CharBuffer, LongBuffer)] {
+ override def initialValue = {
+ val cb = CharBuffer.allocate(tunablesObj.maximumRegexMatchLengthInCharacters)
+ val lb = LongBuffer.allocate(tunablesObj.maximumRegexMatchLengthInCharacters)
+ (cb, lb)
+ }
+ }
+
def setValidationMode(mode: ValidationMode.Type): Unit = { ssrd.validationMode = mode }
def getValidationMode() = ssrd.validationMode
def getVariables = ssrd.variables
diff --git a/daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/parsers/PState.scala b/daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/parsers/PState.scala
index e5e8b4dae..205f2f3c7 100644
--- a/daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/parsers/PState.scala
+++ b/daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/parsers/PState.scala
@@ -18,8 +18,6 @@
package org.apache.daffodil.processors.parsers
import java.nio.channels.Channels
-import java.nio.CharBuffer
-import java.nio.LongBuffer
import scala.Right
import scala.collection.mutable
@@ -322,9 +320,7 @@ final class PState private (
}
}
- // TODO: this should come from a thread safe pool or something so that we do not allocate it for every parse
- override lazy val regexMatchBuffer = CharBuffer.allocate(tunable.maximumRegexMatchLengthInCharacters)
- override lazy val regexMatchBitPositionBuffer = LongBuffer.allocate(tunable.maximumRegexMatchLengthInCharacters)
+ override lazy val (regexMatchBuffer, regexMatchBitPositionBuffer) = dataProcArg.regexMatchState.get
}
object PState {
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
With regards,
Apache Git Services