You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@daffodil.apache.org by sl...@apache.org on 2018/04/05 16:04:05 UTC

[incubator-daffodil] branch master updated: Improved toString of grammar and parser/unparser objects.

This is an automated email from the ASF dual-hosted git repository.

slawrence pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-daffodil.git


The following commit(s) were added to refs/heads/master by this push:
     new 8a29c60  Improved toString of grammar and parser/unparser objects.
8a29c60 is described below

commit 8a29c60b5c5857f17bc63298345fbde24a6d0a8c
Author: Michael Beckerle <mb...@tresys.com>
AuthorDate: Thu Mar 29 12:24:50 2018 -0400

    Improved toString of grammar and parser/unparser objects.
    
    This is just to assist in debugging some thorny problems.
    
    DAFFODIL-1920
---
 .../org/apache/daffodil/grammar/Grammar.scala      | 12 +--
 .../org/apache/daffodil/grammar/Production.scala   |  2 +
 .../grammar/primitives/ElementCombinator.scala     |  4 +
 .../primitives/PrimitivesElementKinds.scala        | 24 +++++-
 .../scala/org/apache/daffodil/xml/XMLUtils.scala   | 85 +++++++++++++++++++---
 .../daffodil/xml/test/unit/TestXMLUtils.scala      | 32 ++++++++
 .../unparsers/ElementKindUnparsers.scala           |  1 -
 7 files changed, 143 insertions(+), 17 deletions(-)

diff --git a/daffodil-core/src/main/scala/org/apache/daffodil/grammar/Grammar.scala b/daffodil-core/src/main/scala/org/apache/daffodil/grammar/Grammar.scala
index 7f02259..eeb7bc2 100644
--- a/daffodil-core/src/main/scala/org/apache/daffodil/grammar/Grammar.scala
+++ b/daffodil-core/src/main/scala/org/apache/daffodil/grammar/Grammar.scala
@@ -180,8 +180,13 @@ abstract class NamedGram(context: SchemaComponent) extends Gram(context) {
   // Note: keep the toString really simple.
   // It causes much grief if toString uses complicated things that can fail or
   // that end up needing the name of this NamedGram again.
-  override def toString = name // + "(" + context.scPath.last + ")" //+ (if (isEmpty) "(Empty)" else "")
 
+  override def name = context match {
+    case nm: NamedMixin => nm.name
+    case _ => super.name
+  }
+
+  override def toString = "<" + name + ">" + super.name + "</" + name + ">"
 }
 
 /**
@@ -192,9 +197,4 @@ abstract class Terminal(contextArg: SchemaComponent, guard: Boolean)
 
   override def isEmpty = !guard
 
-  private lazy val realSC = context.asInstanceOf[SchemaComponent]
-  final override lazy val path = realSC.path + "@@" + diagnosticDebugName
-
-  override def toString = path // dangerous. What if realSC.path fails?
-
 }
diff --git a/daffodil-core/src/main/scala/org/apache/daffodil/grammar/Production.scala b/daffodil-core/src/main/scala/org/apache/daffodil/grammar/Production.scala
index 43dd12b..40a4580 100644
--- a/daffodil-core/src/main/scala/org/apache/daffodil/grammar/Production.scala
+++ b/daffodil-core/src/main/scala/org/apache/daffodil/grammar/Production.scala
@@ -48,6 +48,8 @@ final class Prod(nameArg: String, val sc: SchemaComponent, guard: Boolean, gramA
 
   final override def name = nameArg
 
+  override def toString() = "<" + name + ">" + gram.toString + "</" + name + ">"
+
   final override lazy val path = sc.path + "@@Prod(" + diagnosticDebugName + ")"
 
   final override lazy val gram: Gram = {
diff --git a/daffodil-core/src/main/scala/org/apache/daffodil/grammar/primitives/ElementCombinator.scala b/daffodil-core/src/main/scala/org/apache/daffodil/grammar/primitives/ElementCombinator.scala
index 930ebdf..7dcc7b1 100644
--- a/daffodil-core/src/main/scala/org/apache/daffodil/grammar/primitives/ElementCombinator.scala
+++ b/daffodil-core/src/main/scala/org/apache/daffodil/grammar/primitives/ElementCombinator.scala
@@ -76,6 +76,8 @@ class ElementCombinator(context: ElementBase,
   extends NamedGram(context)
   with Padded {
 
+  override def toString = subComb.toString() // parse centric view of the world. Unparser doesn't use subComb at all.
+
   private lazy val subComb = {
     if (context.isParentUnorderedSequence) {
       new ChoiceElementCombinator(context, eBeforeContent,
@@ -407,6 +409,8 @@ class ChoiceElementCombinator(context: ElementBase, eGramBefore: Gram, eGram: Gr
 abstract class ElementCombinatorBase(context: ElementBase, eGramBefore: Gram, eGram: Gram, eGramAfter: Gram)
   extends NamedGram(context) {
 
+  override def toString() = "<element name='" + name + "'>" + eGram.toString() + "</element>"
+
   // The order of things matters in some cases, so to be consistent we'll always use the
   // same order even when it doesn't matter
 
diff --git a/daffodil-core/src/main/scala/org/apache/daffodil/grammar/primitives/PrimitivesElementKinds.scala b/daffodil-core/src/main/scala/org/apache/daffodil/grammar/primitives/PrimitivesElementKinds.scala
index f8f2dcf..549c4a0 100644
--- a/daffodil-core/src/main/scala/org/apache/daffodil/grammar/primitives/PrimitivesElementKinds.scala
+++ b/daffodil-core/src/main/scala/org/apache/daffodil/grammar/primitives/PrimitivesElementKinds.scala
@@ -45,6 +45,8 @@ import org.apache.daffodil.exceptions.Assert
 import org.apache.daffodil.util.Maybe._
 import org.apache.daffodil.cookers.ChoiceBranchKeyCooker
 import org.apache.daffodil.api.WarnID
+import org.apache.daffodil.util.Misc
+import org.apache.daffodil.xml.XMLUtils
 
 object ENoWarn3 { EqualitySuppressUnusedImportWarning() }
 
@@ -81,7 +83,15 @@ case class DelimiterStackCombinatorElement(e: ElementBase, body: Gram) extends T
   lazy val uInit = if (e.initiatorParseEv.isKnownNonEmpty) One(e.initiatorUnparseEv) else Nope
   lazy val uTerm = if (e.terminatorParseEv.isKnownNonEmpty) One(e.terminatorUnparseEv) else Nope
 
-  lazy val parser: DaffodilParser = new DelimiterStackParser((pInit.toList ++ pTerm.toList).toArray, e.termRuntimeData, body.parser)
+  lazy val delims = (pInit.toList ++ pTerm.toList)
+
+  override def toString() = {
+    val delimAttrib = delims.map { _.toString }.map { XMLUtils.escape(_).toString() }.mkString(" ")
+    "<" + Misc.getNameFromClass(this) + " delims='" + delimAttrib + "'>" +
+      body.toString() +
+      "</" + Misc.getNameFromClass(this) + ">"
+  }
+  lazy val parser: DaffodilParser = new DelimiterStackParser(delims.toArray, e.termRuntimeData, body.parser)
 
   override lazy val unparser: DaffodilUnparser = new DelimiterStackUnparser(uInit, None, uTerm, e.termRuntimeData, body.unparser)
 }
@@ -103,6 +113,11 @@ case class ComplexTypeCombinator(ct: ComplexTypeBase, body: Gram) extends Termin
 
   override def isEmpty = body.isEmpty
 
+  override def toString() =
+    "<" + Misc.getNameFromClass(this) + ">" +
+      body.toString() +
+      "</" + Misc.getNameFromClass(this) + ">"
+
   lazy val parser: DaffodilParser = new ComplexTypeParser(ct.runtimeData, body.parser)
 
   override lazy val unparser: DaffodilUnparser =
@@ -119,6 +134,11 @@ case class SequenceCombinator(sq: SequenceTermBase, rawTerms: Seq[Gram])
     res
   }
 
+  override def toString() =
+    "<" + Misc.getNameFromClass(this) + ">" +
+      terms.map { _.toString() }.mkString +
+      "</" + Misc.getNameFromClass(this) + ">"
+
   private val mt: Gram = EmptyGram
   lazy val body = rawTerms.foldRight(mt) { _ ~ _ }
 
@@ -142,6 +162,7 @@ case class UnorderedSequenceCombinator(s: Sequence, terms: Seq[Gram])
 }
 
 case class ArrayCombinator(e: ElementBase, body: Gram) extends Terminal(e, !body.isEmpty) {
+  override def toString() = "<Array>" + body.toString + "</Array>"
 
   lazy val parser: DaffodilParser = new ArrayCombinatorParser(e.elementRuntimeData, body.parser)
   override lazy val unparser: Unparser = new ArrayCombinatorUnparser(e.elementRuntimeData, body.unparser)
@@ -149,6 +170,7 @@ case class ArrayCombinator(e: ElementBase, body: Gram) extends Terminal(e, !body
 
 case class OptionalCombinator(e: ElementBase, body: Gram) extends Terminal(e, !body.isEmpty) {
 
+  override def toString() = "<Optional>" + body.toString + "</Optional>"
   lazy val parser: DaffodilParser = new OptionalCombinatorParser(e.elementRuntimeData, body.parser)
   override lazy val unparser: Unparser = new OptionalCombinatorUnparser(e.elementRuntimeData, body.unparser)
 }
diff --git a/daffodil-lib/src/main/scala/org/apache/daffodil/xml/XMLUtils.scala b/daffodil-lib/src/main/scala/org/apache/daffodil/xml/XMLUtils.scala
index c363f67..c186212 100644
--- a/daffodil-lib/src/main/scala/org/apache/daffodil/xml/XMLUtils.scala
+++ b/daffodil-lib/src/main/scala/org/apache/daffodil/xml/XMLUtils.scala
@@ -50,23 +50,25 @@ object XMLUtils {
   /**
    * Legal XML v1.0 chars are #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
    */
-  def remapXMLIllegalCharToPUA(checkForExistingPUA: Boolean = true)(c: Char): Char = {
+  def remapXMLIllegalCharToPUA(checkForExistingPUA: Boolean = true, replaceCRWithLF: Boolean = true)(c: Char): Char = {
     val cInt = c.toInt
     val res = cInt match {
       case 0x9 => c
       case 0xA => c
-      case 0xD => 0xA.toChar // Map CR to LF. That's what XML does.
-      case _ if (c < 0x20) => (c + 0xE000).toChar
-      case _ if (c > 0xD7FF && c < 0xE000) => (c + 0x1000).toChar
-      case _ if (c >= 0xE000 && c <= 0xF8FF) => {
+      case 0xD =>
+        if (replaceCRWithLF) 0xA.toChar // Map CR to LF. That's what XML does.
+        else 0xE00D.toChar // or remap it to PUA so it is non-whitespace, and preserved.
+      case _ if (cInt < 0x20) => (cInt + 0xE000).toChar
+      case _ if (cInt > 0xD7FF && cInt < 0xE000) => (cInt + 0x1000).toChar
+      case _ if (cInt >= 0xE000 && cInt <= 0xF8FF) => {
         if (checkForExistingPUA)
           Assert.usageError("Pre-existing Private Use Area (PUA) character found in data: '%s'".format(c))
         else c
       }
       case 0xFFFE => 0xF0FE.toChar
       case 0xFFFF => 0xF0FF.toChar
-      case _ if (c > 0x10FFFF) => {
-        Assert.invariantFailed("Character code beyond U+10FFFF found in data. Codepoint: %s".format(c.toInt))
+      case _ if (cInt > 0x10FFFF) => {
+        Assert.invariantFailed("Character code beyond U+10FFFF found in data. Codepoint: %s".format(cInt))
       }
       case _ => c
 
@@ -356,7 +358,7 @@ object XMLUtils {
   val EXT_NS_APACHE = NS(DAFFODIL_EXTENSION_NAMESPACE_APACHE.uri)
 
   private val DAFFODIL_INTERNAL_NAMESPACE = NS(DAFFODIL_EXTENSIONS_NAMESPACE_ROOT_APACHE + ":int")
-  val INT_PREFIX= "dafint"
+  val INT_PREFIX = "dafint"
   val INT_NS = NS(DAFFODIL_INTERNAL_NAMESPACE.uri)
 
   val FILE_ATTRIBUTE_NAME = "file"
@@ -423,7 +425,7 @@ object XMLUtils {
   def dafAttributes(n: Node) = {
     n.attributes.filter { a =>
       a.getNamespace(n) == XMLUtils.EXT_NS_NCSA.toString ||
-      a.getNamespace(n) == XMLUtils.EXT_NS_APACHE.toString
+        a.getNamespace(n) == XMLUtils.EXT_NS_APACHE.toString
     }
   }
 
@@ -905,6 +907,71 @@ Differences were (path, expected, actual):
     tmpSchemaFile
   }
 
+  /**
+   * Strong escaping that never loses information, and handles apos and CR correctly.
+   *
+   * Escapes apostrophe (single quote) as well as the other XML escaped chars.
+   * Remaps CR and any other XML-illegals into PUA. Replaces whitespace with
+   * numeric character entities for additional safety.
+   *
+   * This is needed since XML may be using single quotes to surround a string which
+   * might itself contain single quotes.
+   *
+   * The reason basic scala.xml.Utility.escape doesn't escape single-quotes is
+   * HTML compatibility. HTML doesn't define an "&apos;" entity.
+   *
+   * Furthermore, since some potentially illegal XML characters may be used here, we
+   * are going to remap all the illegal XML characters to their corresponding PUA characters.
+   *
+   * Lastly, all whitespace and control chars, and NBSP (0xA0), are replaced by numeric
+   * character entities, and anything above 0xFF that is not considered letter or digit
+   * is also replaced by a numeric character entity.
+   *
+   * The result is a string which can be displayed as an XML attribute value, and is invertible back
+   * to the original string. NOTE(review): input already containing PUA chars may not round-trip - confirm.
+   *
+   * Finally, CRLF comes through as &#xE00D;&#xA; and a lone CR as &#xE00D;. That's because
+   * if we used &#xD; for the CR, it might be converted to a LF by XML readers.
+   * We have to use our own PUA remapping trick if we want to be sure to preserve
+   * CR in XML. The escaped text is appended to sb, which is also the return value.
+   */
+  def escape(str: String, sb: StringBuilder = new StringBuilder()): StringBuilder = {
+    var i = 0
+    while (i < str.length) {
+      val x = str(i)
+      val c = escapeMapper(x)
+      i += 1
+      c match {
+        case '\'' => sb.append("&#x27;") // don't use "&apos;" because it's not universally accepted (HTML doesn't have it in early versions)
+        case '"' => sb.append("&quot;")
+        case '&' => sb.append("&amp;")
+        case '<' => sb.append("&lt;")
+        case '>' => sb.append("&gt;")
+        case _ if (c.isLetterOrDigit) => sb.append(c)
+        case _ if (c.isWhitespace || c.isControl) => toNumericCharacterEntity(c, sb)
+        // A0 is the NBSP character - not considered whitespace, but no glyph, so we need it numeric
+        case _ if (c.toInt == 0xA0) => toNumericCharacterEntity(c, sb)
+        // Any other char below 0xFF is punctuation or other glyph char (0xFF itself is a letter, already handled above)
+        case _ if (c.toInt < 0xFF) => sb.append(c)
+        case _ => toNumericCharacterEntity(c, sb)
+      }
+    }
+    sb
+  }
+
+  private val escapeMapper =
+    remapXMLIllegalCharToPUA(
+      checkForExistingPUA = false,
+      replaceCRWithLF = false) _
+
+  def toNumericCharacterEntity(c: Char, sb: StringBuilder) = { // Appends c to sb as a hex numeric character entity, e.g. "&#xA;" for LF.
+    val i = c.toInt
+    Assert.usage(i > 0) // NUL cannot be represented at all in XML.
+    val s = Integer.toHexString(i).toUpperCase() // upper-case hex digits of the char code
+    sb.append("&#x")
+    sb.append(s)
+    sb.append(";") // the value of this last append is what the method evaluates to
+  }
 }
 
 trait GetAttributesMixin extends ThrowsSDE {
diff --git a/daffodil-lib/src/test/scala/org/apache/daffodil/xml/test/unit/TestXMLUtils.scala b/daffodil-lib/src/test/scala/org/apache/daffodil/xml/test/unit/TestXMLUtils.scala
index 6d1ab23..dc9f04e 100644
--- a/daffodil-lib/src/test/scala/org/apache/daffodil/xml/test/unit/TestXMLUtils.scala
+++ b/daffodil-lib/src/test/scala/org/apache/daffodil/xml/test/unit/TestXMLUtils.scala
@@ -231,4 +231,36 @@ class TestXMLUtils {
     assertEquals("&&&", res(0).text)
   }
 
+  @Test def testEscapeLineEndings() {
+    val input = "abc\r\ndef\rghi\njkl\tmno\u0085pqr"
+    val actual = XMLUtils.escape(input).toString()
+    assertEquals("abc&#xE00D;&#xA;def&#xE00D;ghi&#xA;jkl&#x9;mno&#x85;pqr", actual)
+  }
+
+  @Test def testEscape0To127() {
+    val input = (0 to 127).map { _.toChar }.mkString
+    val actual = XMLUtils.escape(input).toString()
+    val expected = "&#xE000;&#xE001;&#xE002;&#xE003;&#xE004;&#xE005;&#xE006;&#xE007;&#xE008;" + // first batch of C0 controls
+      "&#x9;&#xA;" + // Tab and LF
+      "&#xE00B;&#xE00C;" + // more C0 controls
+      "&#xE00D;" + // CR
+      // Even more of the C0 controls.
+      "&#xE00E;&#xE00F;&#xE010;&#xE011;&#xE012;&#xE013;&#xE014;&#xE015;&#xE016;&#xE017;&#xE018;&#xE019;&#xE01A;&#xE01B;&#xE01C;&#xE01D;&#xE01E;&#xE01F;" +
+      "&#x20;" + // space is whitespace, so it comes through numeric.
+      "!&quot;#$%&amp;" + // XML Entities for quot, amp
+      "&#x27;" + // numeric entity for apos aka single quote (because &apos; is not universal, i.e., not in HTML)
+      "()*+,-./0123456789:;&lt;=&gt" + // XML entities for lt, gt (the ';' that ends &gt; starts the next literal)
+      ";?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[" + // the ';' closing &gt; followed by printing characters
+      "\\" + // backslash needs escaping in the Scala string literal. This is ONE character
+      "]^_`abcdefghijklmnopqrstuvwxyz{|}~" + // all printing characters
+      "&#x7F;" // DEL is a control char, so numeric entity for that too.
+    assertEquals(expected, actual)
+  }
+
+  @Test def testEscape128To255() {
+    val input = (128 to 255).map { _.toChar }.mkString
+    val actual = XMLUtils.escape(input).toString()
+    val expected = "&#x80;&#x81;&#x82;&#x83;&#x84;&#x85;&#x86;&#x87;&#x88;&#x89;&#x8A;&#x8B;&#x8C;&#x8D;&#x8E;&#x8F;&#x90;&#x91;&#x92;&#x93;&#x94;&#x95;&#x96;&#x97;&#x98;&#x99;&#x9A;&#x9B;&#x9C;&#x9D;&#x9E;&#x9F;&#xA0;¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
+    assertEquals(expected, actual)
+  }
 }
diff --git a/daffodil-runtime1-unparser/src/main/scala/org/apache/daffodil/processors/unparsers/ElementKindUnparsers.scala b/daffodil-runtime1-unparser/src/main/scala/org/apache/daffodil/processors/unparsers/ElementKindUnparsers.scala
index 088df46..71ff9ca 100644
--- a/daffodil-runtime1-unparser/src/main/scala/org/apache/daffodil/processors/unparsers/ElementKindUnparsers.scala
+++ b/daffodil-runtime1-unparser/src/main/scala/org/apache/daffodil/processors/unparsers/ElementKindUnparsers.scala
@@ -72,7 +72,6 @@ class SequenceCombinatorUnparser(ctxt: ModelGroupRuntimeData, childUnparsers: Ve
     var index = 0
     var doUnparser = false
     val limit = childUnparsers.length
-
     while (index < limit) {
       doUnparser = false
       val childUnparser = childUnparsers(index)

-- 
To stop receiving notification emails like this one, please contact
slawrence@apache.org.