You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@daffodil.apache.org by sl...@apache.org on 2022/01/24 14:00:25 UTC

[daffodil] branch main updated: Add a global TDML CompileResult cache

This is an automated email from the ASF dual-hosted git repository.

slawrence pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/daffodil.git


The following commit(s) were added to refs/heads/main by this push:
     new 4cc0863  Add a global TDML CompileResult cache
4cc0863 is described below

commit 4cc0863e715b79a2bc0f19bc09c5c251305a9596
Author: Steve Lawrence <sl...@apache.org>
AuthorDate: Fri Jan 21 14:30:15 2022 -0500

    Add a global TDML CompileResult cache
    
    We previosly only cache CompileResults on individual DFDLTestSuites.
    This meant that we could compiled schemas within a single test suite,
    but not between tests suites. This could lead to lots of duplicate
    schema compilation if multiple .tdml files referenced the same schema
    
    To avoid this, CompileResults are now stored in a global WeakHashMap
    instead of the DFDLTestSuite. And DFDLTestSuites are changed to maintain
    strong references to the keys of the CompileResult they use. This
    ensures that as long as a DFDLTestSuite exists, so will all of its
    CompileResults, so tests within a DFDLTestSuite can share them. And
    after the DFDLTestSuite is garbage collected, as long as those weak
    references aren't also garbage collected, other DFDLTestSuites will be
    able to use them, and help to minimize compilation same schemas used in
    different TDML files.
    
    DAFFODIL-2627
---
 .../org/apache/daffodil/tdml/TDMLRunner.scala      | 126 ++++++++++++++-------
 .../tdml/processor/TDMLDFDLProcessor.scala         |   8 +-
 .../tdml/processor/Runtime2TDMLDFDLProcessor.scala |  10 +-
 3 files changed, 90 insertions(+), 54 deletions(-)

diff --git a/daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala b/daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala
index d26fbf5..c20a694 100644
--- a/daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala
+++ b/daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/TDMLRunner.scala
@@ -461,63 +461,43 @@ class DFDLTestSuite private[tdml] (
   }
 
   /**
-   * The CompileResult cache is stored in the DFDLTestSuite instance. This
-   * means any tests run from the same instance can share the same compiled
-   * processor. Once the DFDLTestSuite goes out of scope and is garbage
-   * collected, so too will be the compiled processors. Running the tests again
-   * will require creating a new DFDLTestSuite instance, and thus will
-   * recompile the schema.
+   * Get the CompileResult from from the TDMLCompileResultCache.
    */
-  private val tdmlCompileResultCache = mutable.HashMap[TDMLCompileResultCacheKey, TDML.CompileResult]()
-
-  case class TDMLCompileResultCacheKey (
-    impl: String,
-    suppliedSchema: DaffodilSchemaSource,
-    useSerializedProcessor: Boolean,
-    optRootName: Option[String],
-    optRootNamespace: Option[String],
-    tunables: Map[String, String],
-  )
-
-  // Multiple test cases could use the same schema, and because they can be
-  // executed in parallel, could request the same CompileResult from different
-  // threads. To ensure we only compile these schemas once, we cache the
-  // compile results inside this DFDLTestSuite. So any tests run from the same
-  // test suite can share compile results. To deal with potential thread races
-  // and two threads trying to compile the same schema in parallel, we
-  // synchronize this function on this DFDLTestSuite.
   def getCompileResult(
     impl: AbstractTDMLDFDLProcessorFactory,
     suppliedSchema: DaffodilSchemaSource,
     useSerializedProcessor: Boolean,
     optRootName: Option[String],
     optRootNamespace: Option[String],
-    tunables: Map[String, String]): TDML.CompileResult = this.synchronized {
-
-    val key = TDMLCompileResultCacheKey(
-      impl.implementationName,
+    tunables: Map[String, String]): TDML.CompileResult = {
+
+    // The TDMLCompileResultCache will return two things: 1) the CompileResult
+    // and 2) the cache key associated with the CompileResult. If we do not
+    // maintain a strong reference to the returned cache key, then the
+    // CompileResult could be evicted from the cache when the garbage collector
+    // runs. This means subsequent tests in this same DFDLTestSuite may need to
+    // recompile the same DaffodilSchemaSource. To avoid this, we add the cache
+    // key to a member list in this class. This way, as long as this
+    // DFDLTestSuite exists, compiled results from it will also exist and can
+    // be shared in different tests in this suite. And even once this
+    // DFDLTestSuite goes out of scope and is garbage collected, if the weak
+    // references in the cache aren't garbage collected, other DFDLTestSuites
+    // will also be able to use the CompileResult and minimize compilation.
+    val (key, result) = TDMLCompileResultCache.getCompileResult(
+      impl,
       suppliedSchema,
       useSerializedProcessor,
       optRootName,
       optRootNamespace,
       tunables)
-    val compileResult: TDML.CompileResult = {
-      tdmlCompileResultCache.getOrElseUpdate(key, {
-        impl.getProcessor(
-          suppliedSchema,
-          useSerializedProcessor,
-          optRootName,
-          optRootNamespace,
-          tunables)
-      })
-    }
-    compileResult
+    tdmlCompileResultCacheKeys += key
+    result
   }
 
+  private val tdmlCompileResultCacheKeys = mutable.Set[TDMLCompileResultCache.Key]()
+
   def cleanUp(): Unit = {
-    tdmlCompileResultCache.values.foreach { compileRes =>
-      compileRes.map { _._2.cleanUp() }
-    }
+    tdmlCompileResultCacheKeys.clear()
   }
 
 }
@@ -2604,3 +2584,65 @@ object UTF8Encoder {
   }
 
 }
+
+object TDMLCompileResultCache {
+
+  case class Key (
+    impl: String,
+    suppliedSchema: DaffodilSchemaSource,
+    useSerializedProcessor: Boolean,
+    optRootName: Option[String],
+    optRootNamespace: Option[String],
+    tunables: Map[String, String],
+  )
+
+  private val cache = new mutable.WeakHashMap[Key, TDML.CompileResult]()
+
+  /**
+   * Get a compile result. This might come from a cache of compiled results, or
+   * it might compile a new processor if it hasn't been cached or has been
+   * removed from the cache.
+   *
+   * The CompileResult is guaranteed to not be evicted from cache as long as
+   * there exists a strong reference to the returned key. For this reason, it
+   * is important that callers must maintain a strong reference to the key as
+   * long as they want the CompileResult to stay in the cache.
+   */
+  def getCompileResult(
+    impl: AbstractTDMLDFDLProcessorFactory,
+    suppliedSchema: DaffodilSchemaSource,
+    useSerializedProcessor: Boolean,
+    optRootName: Option[String],
+    optRootNamespace: Option[String],
+    tunables: Map[String, String]): (Key, TDML.CompileResult) = this.synchronized {
+
+    val tmpKey = Key(
+      impl.implementationName,
+      suppliedSchema,
+      useSerializedProcessor,
+      optRootName,
+      optRootNamespace,
+      tunables)
+
+    val maybeKeyVal = cache.find { case (k, v) => k == tmpKey }
+    if (maybeKeyVal.isDefined) {
+      // the key is already in the cache, throw away our temp key and return
+      // the actual key and value that is in the WeakHashMap
+      maybeKeyVal.get
+    } else {
+      // the key either was never added to the cache, or it was garbage
+      // collected from the WeakHashMap. Get the processor, add it to the
+      // cache, and return the key/value
+      val processor = impl.getProcessor(
+        suppliedSchema,
+        useSerializedProcessor,
+        optRootName,
+        optRootNamespace,
+        tunables)
+      val kv = (tmpKey, processor)
+      cache += kv
+      kv
+    }
+  }
+
+}
diff --git a/daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/processor/TDMLDFDLProcessor.scala b/daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/processor/TDMLDFDLProcessor.scala
index 857d058..b6e40c9 100644
--- a/daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/processor/TDMLDFDLProcessor.scala
+++ b/daffodil-tdml-lib/src/main/scala/org/apache/daffodil/tdml/processor/TDMLDFDLProcessor.scala
@@ -119,13 +119,7 @@ trait TDMLDFDLProcessor {
 
   def unparse(parseResult: TDMLParseResult, outStream: java.io.OutputStream): TDMLUnparseResult
 
-  /**
-   * The TDMLRunner may cache the TDMLDFDLProcessor so that it can be reused
-   * among multiple parse/unparse calls without needing to rebuild the
-   * processor. This function is called once the TDMLRunner is done with the
-   * TMLDFDLProcessor. This allows state such as temporary files to be cleaned
-   * up. Actually does nothing unless overridden by an implementation.
-   */
+  @deprecated("Function no longer called, cleanup must be handled in individual TDMLResults.", "3.3.0")
   def cleanUp(): Unit = { /* Do nothing */ }
 }
 
diff --git a/daffodil-tdml-processor/src/main/scala/org/apache/daffodil/tdml/processor/Runtime2TDMLDFDLProcessor.scala b/daffodil-tdml-processor/src/main/scala/org/apache/daffodil/tdml/processor/Runtime2TDMLDFDLProcessor.scala
index 756b6e7..72cb7ac 100644
--- a/daffodil-tdml-processor/src/main/scala/org/apache/daffodil/tdml/processor/Runtime2TDMLDFDLProcessor.scala
+++ b/daffodil-tdml-processor/src/main/scala/org/apache/daffodil/tdml/processor/Runtime2TDMLDFDLProcessor.scala
@@ -113,6 +113,11 @@ final class Runtime2TDMLDFDLProcessorFactory private(
       // Compile the generated code into an executable
       val executable = generator.compileCode(codeDir)
 
+      // There is is no way to clean up TDMLDFDLProcessors, they are just garbage
+      // collected when no longer needed. So recursively mark all generated
+      // files as deleteOnExit so we at least clean them up when the JVM exits
+      os.walk.stream(tempDir).foreach { _.toIO.deleteOnExit() }
+
       // Summarize the result of compiling the schema for the test
       val compileResult = if (generator.isError) {
         Left(generator.getDiagnostics) // C code compilation diagnostics
@@ -193,11 +198,6 @@ class Runtime2TDMLDFDLProcessor(tempDir: os.Path, executable: os.Path)
   def unparse(parseResult: TDMLParseResult, outStream: java.io.OutputStream): TDMLUnparseResult = {
     unparse(parseResult.getResult, outStream)
   }
-
-  /**
-   * Remove the generated executable
-   */
-  override def cleanUp(): Unit = os.remove.all(tempDir)
 }
 
 final class Runtime2TDMLParseResult(pr: ParseResult) extends TDMLParseResult {