You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@daffodil.apache.org by GitBox <gi...@apache.org> on 2022/11/04 17:38:45 UTC

[GitHub] [daffodil] stevedlawrence commented on a diff in pull request #832: Daffodil 2720 wip create new char class entities

stevedlawrence commented on code in PR #832:
URL: https://github.com/apache/daffodil/pull/832#discussion_r1004703978


##########
daffodil-runtime1/src/main/scala/org/apache/daffodil/dsom/CompiledExpression1.scala:
##########
@@ -93,10 +93,10 @@ abstract class CompiledExpression[+T <: AnyRef](
 
   /**
    * Tells us if the expression can match the empty string. We know it can if the expression
-   * is a DFDL entity like %ES; or %WSP*. We do not know whether it can if it is a more
+   * is a DFDL entity like %ES; or %WSP*; or %LSP;. We do not know whether it can if it is a more
    * complicated constant or runtime expression.
    */
-  final lazy val isKnownCanMatchEmptyString = value == "%ES;" || value == "%WSP*;"
+  final lazy val isKnownCanMatchEmptyString = value == "%ES;" || value == "%WSP*;" || value == "%LSP*;"

Review Comment:
   Should this also check for ` %SP*;`, since that also matches the empty string?



##########
daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/DFDLDelimiter.scala:
##########
@@ -86,42 +91,57 @@ class Delimiter {
   // it may not be.
   //
   def reduceDelimBuf(delims: Array[DelimBase]): Array[DelimBase] = {
+    var reducedWSP = reduceWhitespaceDelimiters(delims, "wsp")
+    var reducedLSP = reduceWhitespaceDelimiters(reducedWSP, "lsp")
+    var reducedDelims = reduceWhitespaceDelimiters(reducedLSP, "sp")
+    reducedDelims
+  }
+
+  def reduceWhitespaceDelimiters(delims: Array[DelimBase], delimType: String): Array[DelimBase] = {
 
     val q: Queue[DelimBase] = new Queue[DelimBase]()
 
-    // Counters to keep track of WSP,+,* objects
-    var numWSP: Int = 0
-    var numWSP_Plus: Int = 0
-    var numWSP_Star: Int = 0
+    // Counters to keep track of Delim base,+,* objects
+    var numDelim_Standard: Int = 0
+    var numDelim_Plus: Int = 0
+    var numDelim_Star: Int = 0
 
     var idx: Int = 0 // To index the resultant array
 
     delims.foreach(delim => {
-      delim match {
-        case wsp: WSPDelim => numWSP += 1
-        case wsp: WSPPlusDelim => numWSP_Plus += 1
-        case wsp: WSPStarDelim => numWSP_Star += 1
+      (delimType, delim) match {
+        case ("wsp", _: WSPDelim) => numDelim_Standard += 1
+        case ("wsp", _: WSPPlusDelim) => numDelim_Plus += 1
+        case ("wsp", _: WSPStarDelim) => numDelim_Star += 1
+        case ("lsp", _: LSPDelim) => numDelim_Standard += 1
+        case ("lsp", _: LSPPlusDelim) => numDelim_Plus += 1
+        case ("lsp", _: LSPStarDelim) => numDelim_Star += 1
+        case ("sp", _: SPPlusDelim) => numDelim_Plus += 1
+        case ("sp", _: SPStarDelim) => numDelim_Star += 1

Review Comment:
   I assume this breaks down if a user ever has a separator set to something like `%SP*;%SP;"`, since the single `%SP;` is converted to a space in the entity replacer, so this can't combine the two.
   
   This also breaks down if a user does something like `%WSP*;%SP;`. This won't be combined because this function only combines like delimiters classes. So the `WSP*` will consume all whitespace (including any normal space characters), so the SP+ will never match anything. The same issue happens anything else that WSP consumes.
   
   I wonder if we should even be doing all this reducing of whitespace delimiters? I imaigine the DFDL spec just says that character classes are greedy will match as much as possible, and thus is possible to create delimiters that will never match? I think what you have is fine, but we might want to consider just removing all this delimiter reduction logic.



##########
daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/DFDLDelimiter.scala:
##########
@@ -86,112 +92,183 @@ class Delimiter {
   // it may not be.
   //
   def reduceDelimBuf(delims: Array[DelimBase]): Array[DelimBase] = {
+    var reducedWSP = reduceWhitespaceDelimiters(delims, "wsp")
+    var reducedLSP = reduceWhitespaceDelimiters(reducedWSP, "lsp")
+    var reducedDelims = reduceWhitespaceDelimiters(reducedLSP, "sp")
+    reducedDelims
+  }
 
+  def reduceWhitespaceDelimiters(delims: Array[DelimBase], delimType: String): Array[DelimBase] = {
     val q: Queue[DelimBase] = new Queue[DelimBase]()
 
-    // Counters to keep track of WSP,+,* objects
-    var numWSP: Int = 0
-    var numWSP_Plus: Int = 0
-    var numWSP_Star: Int = 0
+    // Counters to keep track of delim standard,+,* objects
+    var numDelim_Standard: Int = 0
+    var numDelim_Plus: Int = 0
+    var numDelim_Star: Int = 0
 
     var idx: Int = 0 // To index the resultant array
 
     delims.foreach(delim => {
-      delim match {
-        case wsp: WSPDelim => numWSP += 1
-        case wsp: WSPPlusDelim => numWSP_Plus += 1
-        case wsp: WSPStarDelim => numWSP_Star += 1
-        case _ => {
-          // We've reached a non WSP delimiter, check if we've
-          // previously encountered any WSP delimiter objects and
-          // return the equivalent representation (if any)
-          val result = getReducedDelim(numWSP, numWSP_Plus, numWSP_Star)
-
-          if (result.isDefined) {
-            val x = result.get
-            // WSP exists and an equivalent representation was found
-            x.index = idx // Set the delimiter's index
-            q += x
+      if (delimType == "wsp") {
+        delim match {
+          case wsp: WSPDelim => numDelim_Standard += 1
+          case wsp: WSPPlusDelim => numDelim_Plus += 1
+          case wsp: WSPStarDelim => numDelim_Star += 1
+          case _ => {
+            val (newQ, newIdx) = updateQueue(numDelim_Standard, numDelim_Plus, numDelim_Star, q, idx, delimType)
+            q.clear()
+            newQ.foreach(x => q += x)
+
+            idx = newIdx
+            delim.index = idx
+            q += delim
             idx += 1
-          } else {
-            // Reduction not possible, but did we come across
-            // more than one WSP?
-
-            var i = 0
-            while (i < numWSP) {
-              val wsp = new WSPDelim
-              wsp.index = idx
-              q += wsp
-              idx += 1
-              i += 1
-            }
+
+            numDelim_Standard = 0
+            numDelim_Plus = 0
+            numDelim_Star = 0
           }
+        }
+      } else if (delimType == "lsp") {
+        delim match {
+          case lsp: LSPDelim => numDelim_Standard += 1
+          case lsp: LSPPlusDelim => numDelim_Plus += 1
+          case lsp: LSPStarDelim => numDelim_Star += 1
+          case _ => {
+            val (newQ, newIdx) = updateQueue(numDelim_Standard, numDelim_Plus, numDelim_Star, q, idx, delimType)
+            q.clear()
+            newQ.foreach(x => q += x)
+
+            idx = newIdx
+            delim.index = idx
+            q += delim
+            idx += 1
 
-          // Set the delimiter's index, needed to
-          // update the delimBuf individual node (DelimBase) state later
-          delim.index = idx
-          q += delim
-          idx += 1
+            numDelim_Standard = 0
+            numDelim_Plus = 0
+            numDelim_Star = 0
+          }
+        }
+      } else if (delimType == "sp") {
+        delim match {
+          case sp: SPDelim => numDelim_Standard += 1
+          case sp: SPPlusDelim => numDelim_Plus += 1
+          case sp: SPStarDelim => numDelim_Star += 1
+          case _ => {
+            val (newQ, newIdx) = updateQueue(numDelim_Standard, numDelim_Plus, numDelim_Star, q, idx, delimType)
+            q.clear()
+            newQ.foreach(x => q += x)
+
+            idx = newIdx
+            delim.index = idx
+            q += delim
+            idx += 1
 
-          // Reset counters
-          numWSP = 0
-          numWSP_Plus = 0
-          numWSP_Star = 0
+            numDelim_Standard = 0
+            numDelim_Plus = 0
+            numDelim_Star = 0
+          }
         }
       }
-    }) // end-for-each
+    })
 
     // Check for leftovers in case the delimiter
     // ends in spaces
-    val result = getReducedDelim(numWSP, numWSP_Plus, numWSP_Star)
+    val (newQ, newIdx) = updateQueue(numDelim_Standard, numDelim_Plus, numDelim_Star, q, idx, delimType)
+    q.clear()
+    newQ.foreach(x => q += x)
+
+    q.toArray[DelimBase]
+  }
+
+  def updateQueue(numDelim_Standard: Int, numDelim_Plus: Int, numDelim_Star: Int, q: Queue[DelimBase], idx: Int, delimType: String) = {
+    var localIdx = idx
+    val q2: Queue[DelimBase] = new Queue[DelimBase]()
+    q.foreach(x => q2 += x)   //q2 is now q and q2 is mutable.
+
+    val result = getReducedDelim(numDelim_Standard, numDelim_Plus, numDelim_Star, delimType)
 
     if (result.isDefined) {
       val x = result.get
-      x.index = idx
-      q += x
+      // Delim exists and an equivalent representation was found
+      x.index = localIdx // Set the delimiter's index
+      q2 += x
+      localIdx += 1
     } else {
       // Reduction not possible, but did we come across
-      // more than one WSP?
-
+      // more than one delim?
       var i = 0
-      while (i < numWSP) {
-        val wsp = new WSPDelim
-        wsp.index = idx
-        q += wsp
-        idx += 1
+      while (i < numDelim_Standard) {
+        if(delimType == "wsp"){
+          val wsp = new WSPDelim
+          wsp.index = localIdx
+          q2 += wsp
+        }else if (delimType == "lsp"){
+          val lsp = new LSPDelim
+          lsp.index = localIdx
+          q2 += lsp
+        }else{
+          val sp = new SPDelim
+          sp.index = localIdx
+          q2 += sp
+        }
+        localIdx += 1
         i += 1
       }
     }
 
-    q.toArray[DelimBase]
+    (q2, localIdx)
   }
 
   // Based upon what WSP delimiters were encountered,
   // determine the equivalent representation (if any) and return it.
-  //
-  def getReducedDelim(numWSP: Int, numWSP_Plus: Int, numWSP_Star: Int): Maybe[DelimBase] = {
-    // 				TRUTH TABLE
-    //		WSP		WSP+	WSP*	RESULT
-    // 1	0		0		0		NONE
-    // 2	0		0		1		WSP*
-    // 3	0		1		0		WSP+
-    // 4	0		1		1		WSP+
-    // 5	1		0		0		WSP
-    // 6	1		0		1		WSP+
-    // 7	1		1		0		WSP+
-    // 8	1		1		1		WSP+
-    if (numWSP_Plus != 0) {
+  // 				TRUTH TABLE
+  //		delim		delim+	delim*	RESULT
+  // 1	0		0		0		NONE
+  // 2	0		0		1		delim*
+  // 3	0		1		0		delim+
+  // 4	0		1		1		delim+
+  // 5	1		0		0		delim
+  // 6	1		0		1		delim+
+  // 7	1		1		0		delim+
+  // 8	1		1		1		delim+
+  def getReducedDelim(numDelim: Int, numDelim_Plus: Int, numDelim_Star: Int, delimType: String): Maybe[DelimBase] = {
+    if (numDelim_Plus != 0) {

Review Comment:
   Any thoughts on making this change so it better matches the truth table at the top?



##########
daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/dfa/Rules.scala:
##########
@@ -599,25 +601,53 @@ class CharState(states: => ArrayBuffer[State], char: Char, val nextState: Int, v
   }
 }
 
-abstract class WSPBase(states: => ArrayBuffer[State])
-  extends DelimStateBase(states) with WSP {
+abstract class DelimsBase(states: => ArrayBuffer[State], delimType: String)
+  extends DelimStateBase(states) with WSP with LSP with SP {
+  override lazy val SPACE: Char = '\u0020'
+  override lazy val CTRL0 = delimType match {
+    case "wsp" | "wsp+" | "wsp*" | "lsp" | "lsp+" | "lsp*" => '\u0009'
+  }
+  override lazy val allChars = delimType match {
+    case "wsp" | "wsp+" | "wsp*" => Seq(CTRL0, CTRL1, CTRL2, CTRL3, CTRL4, SPACE, NEL,
+    NBSP, OGHAM, MONG, SP0, SP1, SP2, SP3, SP4, SP5, SP6, SP7, SP8, SP9, SP10,
+    LSP, PSP, NARROW, MED, IDE)
+    case "lsp" | "lsp+" | "lsp*" => Seq(SPACE, CTRL0)
+    case  "sp+" | "sp*" => Seq(SPACE)
+  }
 
   def checkMatch(charIn: Char): Boolean = {
-    val result = charIn match {
-      case CTRL0 | CTRL1 | CTRL2 | CTRL3 | CTRL4 => true
-      case SPACE | NEL | NBSP | OGHAM | MONG => true
-      case SP0 | SP1 | SP2 | SP3 | SP4 | SP5 | SP6 | SP7 | SP8 | SP9 | SP10 => true
-      case LSP | PSP | NARROW | MED | IDE => true
-      case _ => false
+    val result = delimType match {
+      case "wsp" | "wsp+" | "wsp*" =>

Review Comment:
   This checkMatch functions need to be as fast as possible, we probably want to avoid having to do a string compare aginst the delimType. Instead of passing in delimType, I would suggest maybe having a unqie class for each character class and he checkMatch function then knows exactly what it should match without having to compare against a passed in `delimType`.
   
   So we'll bring back all the WSP States that you replaced, and also add LSP and SP States as well. Having a DelimBase or DelimPlus/Star/etc bases might still make sense if the goal here was to reduce duplication.



##########
daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/DFDLDelimiter.scala:
##########
@@ -57,6 +57,11 @@ class Delimiter {
   private lazy val WSP_Plus = Pattern.compile("(WSP\\+);", Pattern.MULTILINE)
   private lazy val WSP_Star = Pattern.compile("(WSP\\*);", Pattern.MULTILINE)
   private lazy val ES = Pattern.compile("(ES);", Pattern.MULTILINE)
+  private lazy val LSP = Pattern.compile("(LSP);")
+  private lazy val LSP_Plus = Pattern.compile("(LSP\\+);")
+  private lazy val LSP_Star = Pattern.compile("(LSP\\*);")
+  private lazy val SP_Plus = Pattern.compile("(SP\\+);")
+  private lazy val SP_Star = Pattern.compile("(SP\\*);")

Review Comment:
   Should these all have `Pattern.MULTILINE` like above? I'm not really sure why MULTLINE is needed here. Maybe it's only needed for WSP since that can match a newline?



##########
daffodil-runtime1/src/main/scala/org/apache/daffodil/dsom/CompiledExpression1.scala:
##########
@@ -93,10 +93,10 @@ abstract class CompiledExpression[+T <: AnyRef](
 
   /**
    * Tells us if the expression can match the empty string. We know it can if the expression
-   * is a DFDL entity like %ES; or %WSP*. We do not know whether it can if it is a more
+   * is a DFDL entity like %ES; or %WSP*; or %LSP;. We do not know whether it can if it is a more

Review Comment:
   Comment mentions LSP but should mention LSP*. Should also mention SP*, right?



##########
daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/DFDLDelimiter.scala:
##########
@@ -574,6 +667,38 @@ abstract class WSPBase extends DelimBase with WSP {
   }
 }
 
+abstract class LSPBase extends DelimBase with LSP {
+  lazy val typeName = "LSPBase"
+  def checkMatch(charIn: Char): Boolean = {
+    charIn match {
+      case SPACE | CTRL0 => isMatched = true
+      case _ => isMatched = false
+    }
+    isMatched
+  }
+
+  def printStr = {
+    val res = typeName
+    res
+  }
+}
+
+abstract class SPBase extends DelimBase with SP {
+  lazy val typeName = "SPBase"
+  def checkMatch(charIn: Char): Boolean = {
+    charIn match {
+      case SPACE => isMatched = true
+      case _ => isMatched = false
+    }
+    isMatched

Review Comment:
   Same here, are we missing some SP tests?



##########
daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/DFDLDelimiter.scala:
##########
@@ -86,112 +92,183 @@ class Delimiter {
   // it may not be.
   //
   def reduceDelimBuf(delims: Array[DelimBase]): Array[DelimBase] = {
+    var reducedWSP = reduceWhitespaceDelimiters(delims, "wsp")
+    var reducedLSP = reduceWhitespaceDelimiters(reducedWSP, "lsp")
+    var reducedDelims = reduceWhitespaceDelimiters(reducedLSP, "sp")
+    reducedDelims
+  }
 
+  def reduceWhitespaceDelimiters(delims: Array[DelimBase], delimType: String): Array[DelimBase] = {
     val q: Queue[DelimBase] = new Queue[DelimBase]()
 
-    // Counters to keep track of WSP,+,* objects
-    var numWSP: Int = 0
-    var numWSP_Plus: Int = 0
-    var numWSP_Star: Int = 0
+    // Counters to keep track of delim standard,+,* objects
+    var numDelim_Standard: Int = 0
+    var numDelim_Plus: Int = 0
+    var numDelim_Star: Int = 0
 
     var idx: Int = 0 // To index the resultant array
 
     delims.foreach(delim => {
-      delim match {
-        case wsp: WSPDelim => numWSP += 1
-        case wsp: WSPPlusDelim => numWSP_Plus += 1
-        case wsp: WSPStarDelim => numWSP_Star += 1
-        case _ => {
-          // We've reached a non WSP delimiter, check if we've
-          // previously encountered any WSP delimiter objects and
-          // return the equivalent representation (if any)
-          val result = getReducedDelim(numWSP, numWSP_Plus, numWSP_Star)
-
-          if (result.isDefined) {
-            val x = result.get
-            // WSP exists and an equivalent representation was found
-            x.index = idx // Set the delimiter's index
-            q += x
+      if (delimType == "wsp") {
+        delim match {
+          case wsp: WSPDelim => numDelim_Standard += 1
+          case wsp: WSPPlusDelim => numDelim_Plus += 1
+          case wsp: WSPStarDelim => numDelim_Star += 1
+          case _ => {
+            val (newQ, newIdx) = updateQueue(numDelim_Standard, numDelim_Plus, numDelim_Star, q, idx, delimType)
+            q.clear()
+            newQ.foreach(x => q += x)
+
+            idx = newIdx
+            delim.index = idx
+            q += delim
             idx += 1
-          } else {
-            // Reduction not possible, but did we come across
-            // more than one WSP?
-
-            var i = 0
-            while (i < numWSP) {
-              val wsp = new WSPDelim
-              wsp.index = idx
-              q += wsp
-              idx += 1
-              i += 1
-            }
+
+            numDelim_Standard = 0
+            numDelim_Plus = 0
+            numDelim_Star = 0
           }
+        }
+      } else if (delimType == "lsp") {
+        delim match {
+          case lsp: LSPDelim => numDelim_Standard += 1
+          case lsp: LSPPlusDelim => numDelim_Plus += 1
+          case lsp: LSPStarDelim => numDelim_Star += 1
+          case _ => {
+            val (newQ, newIdx) = updateQueue(numDelim_Standard, numDelim_Plus, numDelim_Star, q, idx, delimType)
+            q.clear()
+            newQ.foreach(x => q += x)
+
+            idx = newIdx
+            delim.index = idx
+            q += delim
+            idx += 1
 
-          // Set the delimiter's index, needed to
-          // update the delimBuf individual node (DelimBase) state later
-          delim.index = idx
-          q += delim
-          idx += 1
+            numDelim_Standard = 0
+            numDelim_Plus = 0
+            numDelim_Star = 0
+          }
+        }
+      } else if (delimType == "sp") {
+        delim match {
+          case sp: SPDelim => numDelim_Standard += 1
+          case sp: SPPlusDelim => numDelim_Plus += 1
+          case sp: SPStarDelim => numDelim_Star += 1
+          case _ => {
+            val (newQ, newIdx) = updateQueue(numDelim_Standard, numDelim_Plus, numDelim_Star, q, idx, delimType)
+            q.clear()
+            newQ.foreach(x => q += x)
+
+            idx = newIdx
+            delim.index = idx
+            q += delim
+            idx += 1
 
-          // Reset counters
-          numWSP = 0
-          numWSP_Plus = 0
-          numWSP_Star = 0
+            numDelim_Standard = 0
+            numDelim_Plus = 0
+            numDelim_Star = 0
+          }
         }
       }
-    }) // end-for-each
+    })
 
     // Check for leftovers in case the delimiter
     // ends in spaces
-    val result = getReducedDelim(numWSP, numWSP_Plus, numWSP_Star)
+    val (newQ, newIdx) = updateQueue(numDelim_Standard, numDelim_Plus, numDelim_Star, q, idx, delimType)
+    q.clear()
+    newQ.foreach(x => q += x)
+
+    q.toArray[DelimBase]
+  }
+
+  def updateQueue(numDelim_Standard: Int, numDelim_Plus: Int, numDelim_Star: Int, q: Queue[DelimBase], idx: Int, delimType: String) = {
+    var localIdx = idx
+    val q2: Queue[DelimBase] = new Queue[DelimBase]()
+    q.foreach(x => q2 += x)   //q2 is now q and q2 is mutable.
+
+    val result = getReducedDelim(numDelim_Standard, numDelim_Plus, numDelim_Star, delimType)
 
     if (result.isDefined) {
       val x = result.get
-      x.index = idx
-      q += x
+      // Delim exists and an equivalent representation was found
+      x.index = localIdx // Set the delimiter's index
+      q2 += x
+      localIdx += 1
     } else {
       // Reduction not possible, but did we come across
-      // more than one WSP?
-
+      // more than one delim?
       var i = 0
-      while (i < numWSP) {
-        val wsp = new WSPDelim
-        wsp.index = idx
-        q += wsp
-        idx += 1
+      while (i < numDelim_Standard) {
+        if(delimType == "wsp"){
+          val wsp = new WSPDelim
+          wsp.index = localIdx
+          q2 += wsp
+        }else if (delimType == "lsp"){
+          val lsp = new LSPDelim
+          lsp.index = localIdx
+          q2 += lsp
+        }else{
+          val sp = new SPDelim
+          sp.index = localIdx
+          q2 += sp
+        }
+        localIdx += 1
         i += 1
       }
     }
 
-    q.toArray[DelimBase]
+    (q2, localIdx)
   }
 
   // Based upon what WSP delimiters were encountered,
   // determine the equivalent representation (if any) and return it.
-  //
-  def getReducedDelim(numWSP: Int, numWSP_Plus: Int, numWSP_Star: Int): Maybe[DelimBase] = {
-    // 				TRUTH TABLE
-    //		WSP		WSP+	WSP*	RESULT
-    // 1	0		0		0		NONE
-    // 2	0		0		1		WSP*
-    // 3	0		1		0		WSP+
-    // 4	0		1		1		WSP+
-    // 5	1		0		0		WSP
-    // 6	1		0		1		WSP+
-    // 7	1		1		0		WSP+
-    // 8	1		1		1		WSP+
-    if (numWSP_Plus != 0) {
+  // 				TRUTH TABLE
+  //		delim		delim+	delim*	RESULT
+  // 1	0		0		0		NONE

Review Comment:
   Still looks off, might be mixture of tabs and spaces issue?



##########
daffodil-runtime1/src/main/scala/org/apache/daffodil/processors/DFDLDelimiter.scala:
##########
@@ -574,6 +667,38 @@ abstract class WSPBase extends DelimBase with WSP {
   }
 }
 
+abstract class LSPBase extends DelimBase with LSP {
+  lazy val typeName = "LSPBase"
+  def checkMatch(charIn: Char): Boolean = {
+    charIn match {
+      case SPACE | CTRL0 => isMatched = true
+      case _ => isMatched = false
+    }
+    isMatched

Review Comment:
   Surprised this isn't hit. Do we not have tests for LSP?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@daffodil.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org