You are viewing a plain text version of this content. The canonical link for it is here.
Posted to users@daffodil.apache.org by "Costello, Roger L." <co...@mitre.org> on 2018/11/14 19:33:43 UTC

Best practice: all inline? no inline? mix?

Hello DFDL Community,

I have a simple data format:

	Label: Message

Here is a sample input:

	Dear Sir: Thank you for your response.

I will take this perspective: 

	The data format consists of a series of
	two strings (label and message),
	separated by colon.

Here is the XML that I wish to produce:

<really-simple-format>
      <label>Dear Sir</label>
      <message>Thank you for your response.</message>
</really-simple-format>

I have identified 3 approaches to design the DFDL schema:

1. Inline the DFDL stuff with the XML schema stuff.

2. Don't inline any DFDL stuff; instead, put all the DFDL stuff at the top, inside xs:annotation.

3. Inline some DFDL stuff, put some DFDL stuff at the top, inside xs:annotation.

Below I show the 3 approaches. Which is best practice? I know you will say the third approach (mix approach) is best practice. Okay, then, which DFDL stuff should be inlined and which should be put at the top, inside xs:annotation? What is the rationale for how you divvy up the DFDL stuff between inline and at the top? Do you agree with how I divvied up the DFDL stuff? Would you put more stuff inline? If so, what other stuff would you put inline?

------------------------------------
    All Inline Approach
------------------------------------

<xs:element name="really-simple-format"
    dfdl:alignment="implicit" 
    dfdl:alignmentUnits="bytes"
    dfdl:encoding="UTF-8"
    dfdl:escapeSchemeRef=""
    dfdl:ignoreCase="no"
    dfdl:initiator=""  
    dfdl:leadingSkip="0"
    dfdl:lengthKind="delimited" 
    dfdl:outputNewLine="%CR;%LF;"
    dfdl:representation="text"
    dfdl:terminator=""
    dfdl:textPadKind="none"
    dfdl:textTrimKind="none"
    dfdl:trailingSkip="0"
    dfdl:truncateSpecifiedLengthString="no"
    >
    <xs:complexType>
        <xs:sequence 
            dfdl:alignment="implicit"  
            dfdl:alignmentUnits="bytes"   
            dfdl:encoding="UTF-8" 
            dfdl:ignoreCase="no"
            dfdl:initiatedContent="no"
            dfdl:initiator="" 
            dfdl:leadingSkip="0" 
            dfdl:lengthKind="delimited" 
            dfdl:outputNewLine="%CR;%LF;"
            dfdl:separator=":" 
            dfdl:separatorPosition="infix"
            dfdl:separatorSuppressionPolicy="never" 
            dfdl:sequenceKind="ordered"
            dfdl:terminator="" 
            dfdl:trailingSkip="0" 
            >
            <xs:element name="label" type="xs:string" 
                dfdl:alignment="implicit" 
                dfdl:alignmentUnits="bytes"
                dfdl:encoding="UTF-8"
                dfdl:escapeSchemeRef=""
                dfdl:ignoreCase="no"
                dfdl:initiator=""  
                dfdl:leadingSkip="0"
                dfdl:lengthKind="delimited" 
                dfdl:outputNewLine="%CR;%LF;"
                dfdl:representation="text"
                dfdl:terminator=""
                dfdl:textPadKind="none"
                dfdl:textTrimKind="none"
                dfdl:trailingSkip="0"
                dfdl:truncateSpecifiedLengthString="no"
            />
            <xs:element name="message" type="xs:string"
                dfdl:alignment="implicit" 
                dfdl:alignmentUnits="bytes"
                dfdl:encoding="UTF-8"
                dfdl:escapeSchemeRef=""
                dfdl:ignoreCase="no"
                dfdl:initiator=""  
                dfdl:leadingSkip="0"
                dfdl:lengthKind="delimited" 
                dfdl:outputNewLine="%CR;%LF;"
                dfdl:representation="text"
                dfdl:terminator=""
                dfdl:textPadKind="none"
                dfdl:textTrimKind="none"
                dfdl:trailingSkip="0"
                dfdl:truncateSpecifiedLengthString="no" 
            />
        </xs:sequence>
    </xs:complexType>
</xs:element>

------------------------------------
    No Inline Approach
------------------------------------

<xs:annotation>
    <xs:appinfo source="http://www.ogf.org/dfdl/">
        <dfdl:format  
            alignment="1" 
            alignmentUnits="bytes"  
            binaryFloatRep="ieee" 
            binaryNumberRep="binary"  
            bitOrder="mostSignificantBitFirst"
            byteOrder="bigEndian"  
            calendarPatternKind="implicit"
            documentFinalTerminatorCanBeMissing="yes" 
            emptyValueDelimiterPolicy="none"
            encoding="ISO-8859-1"
            encodingErrorPolicy="replace" 
            escapeSchemeRef=""  
            fillByte="f" 
            floating="no" 
            ignoreCase="no" 
            initiator="" 
            initiatedContent="no" 
            leadingSkip="0" 
            lengthKind="delimited"
            lengthUnits="bits"  
            nilKind="literalValue"  
            nilValueDelimiterPolicy="none"
            occursCountKind="implicit"
            outputNewLine="%CR;%LF;"
            representation="text"
            separator=":" 
            separatorPosition="infix"
            separatorSuppressionPolicy="never"  
            sequenceKind="ordered" 
            terminator=""   
            textBidi="no" 
            textNumberCheckPolicy="strict"
            textNumberPattern="#,##0.###;-#,##0.###" 
            textNumberRep="standard" 
            textNumberRounding="explicit"  
            textNumberRoundingIncrement="0"
            textNumberRoundingMode="roundUnnecessary" 
            textOutputMinLength="0" 
            textPadKind="none" 
            textStandardBase="10"
            textStandardExponentRep="E"
            textStandardInfinityRep="Inf"  
            textStandardNaNRep="NaN"
            textStandardZeroRep="0" 
            textStandardGroupingSeparator="," 
            textTrimKind="none" 
            trailingSkip="0" 
            truncateSpecifiedLengthString="no" 
            utf16Width="fixed" 
        />
    </xs:appinfo>
</xs:annotation>

<xs:element name="really-simple-format">
    <xs:complexType>
        <xs:sequence>
            <xs:element name="label" type="xs:string" />
            <xs:element name="message" type="xs:string" />
        </xs:sequence>
    </xs:complexType>
</xs:element>

------------------------------------
    Mix Approach
------------------------------------

<xs:annotation>
    <xs:appinfo source="http://www.ogf.org/dfdl/">
        <dfdl:format  
            alignment="1" 
            alignmentUnits="bytes"  
            binaryFloatRep="ieee" 
            binaryNumberRep="binary"  
            bitOrder="mostSignificantBitFirst"
            byteOrder="bigEndian"  
            calendarPatternKind="implicit"
            documentFinalTerminatorCanBeMissing="yes" 
            emptyValueDelimiterPolicy="none"
            encoding="ISO-8859-1"
            encodingErrorPolicy="replace" 
            escapeSchemeRef=""  
            fillByte="f" 
            floating="no" 
            ignoreCase="no" 
            initiator="" 
            initiatedContent="no" 
            leadingSkip="0" 
            lengthKind="delimited"
            lengthUnits="bits"  
            nilKind="literalValue"  
            nilValueDelimiterPolicy="none"
            occursCountKind="implicit"
            outputNewLine="%CR;%LF;"
            representation="text" 
            separatorSuppressionPolicy="never"  
            sequenceKind="ordered" 
            terminator=""   
            textBidi="no" 
            textNumberCheckPolicy="strict"
            textNumberPattern="#,##0.###;-#,##0.###" 
            textNumberRep="standard" 
            textNumberRounding="explicit"  
            textNumberRoundingIncrement="0"
            textNumberRoundingMode="roundUnnecessary" 
            textOutputMinLength="0" 
            textPadKind="none" 
            textStandardBase="10"
            textStandardExponentRep="E"
            textStandardInfinityRep="Inf"  
            textStandardNaNRep="NaN"
            textStandardZeroRep="0" 
            textStandardGroupingSeparator="," 
            textTrimKind="none" 
            trailingSkip="0" 
            truncateSpecifiedLengthString="no" 
            utf16Width="fixed" 
        />
    </xs:appinfo>
</xs:annotation>

<xs:element name="really-simple-format">
    <xs:complexType>
        <xs:sequence dfdl:separator=":" dfdl:separatorPosition="infix">
            <xs:element name="label" type="xs:string" />
            <xs:element name="message" type="xs:string" />
        </xs:sequence>
    </xs:complexType>
</xs:element>

Re: Best practice: all inline? no inline? mix?

Posted by Mike Beckerle <mb...@tresys.com>.
On the question of ":" as separator vs. terminator: when there are just two things, I tend to go with terminator, because then there's none of this infix/prefix/postfix confusion.


That would eliminate the property dfdl:separatorPosition.


I tend to put separatorPosition="infix" in the base format, and treat any need for prefix/postfix as a local override of it. That way the concept of non-infix separators doesn't even appear and have to be understood by the schema user, unless it is needed.


Normally we think of separators as being "between", i.e., separating things. Unless a format has to disabuse someone of this intuition for some reason, I think it is best to avoid separatorPosition appearing in the schema except in the base format.


Historic note: The fact that there's this concept of a prefix/postfix separator (which some would say "isn't really a separator any more") is an artifact of that idiom appearing in more than one commercial data tool back when we were coming up with the base property set for DFDL. One tool was from Mercator - company and product now a part of IBM. One tool was Microsoft BizTalk - but I think they copied it from Mercator for the same reason DFDL did. Mercator really had the biggest richest set of format properties of any tools I have seen. I think lots of data integration tools copied from their property set. We were very fortunate to have Mercator people on the DFDL work-group for several years. They had a wealth of data experience that is really hard to match.


Your perspective 2 is my favorite here.


For perspective 3....


The only excess property is dfdl:lengthUnits="characters", which, unless it varies elsewhere in the schema in subtle ways, I'd want to hoist out to the base format. Even if it varies, I'd tend to have a couple of named formats to use, one with lengthUnits 'characters' and one for 'bytes' or 'bits', and try to use these different base formats as the 'themes' for the different schema elements.


There are reasons, not in this little example, but in some more complex situations, for doing what you are doing in perspective 3.


The ":" is syntax, and the term I use for creating an element that captures a piece of syntax is "modeling syntax as data". Sometimes this is just the right thing, and makes a format easier to express. Or sometimes there is subtle information content in which of several delimiters was actually chosen, like it identifies what system produced the data, or identifies what version of the data format. E.g., if you had a format where the separator could be either "," or ";" you can say dfdl:separator=", ;" (a whitespace-separated list of the alternatives) and DFDL will treat them equally on parse (though on unparse it produces only the first one, ","). If, however, you need to accept either, and produce the one that the data was parsed with.... then you have to model that syntax as data.


We'll have to keep our eyes out for a really motivating example of this modeling syntax as data thing. I've done it before. I just can't right now think of where. Maybe complex escape-scheme+padding examples, where neither an initiator nor an escapeScheme of type escapeBlock were quite the right thing?....


The technique is a bit of an escape valve, i.e., for when you can't figure out what the right subtle combination of DFDL properties is to capture your format. You have this option to just punt and start modeling the syntax features of the format as elements. It's a bit heavyweight. (Kind of like a programmer spreading their code out so every line contains exactly one procedure or function call or operator, with a print statement between them.) But it tends to work and, as the infoset ends up with elements that show you the syntax pieces, tends to be debuggable. It does tend to be a lot less readable in the end because more lines of schema are spent on the syntax than you'd like.

________________________________
From: Costello, Roger L. <co...@mitre.org>
Sent: Friday, November 16, 2018 1:52:10 PM
To: users@daffodil.apache.org
Subject: RE: Best practice: all inline? no inline? mix?


Thank you Mike – outstanding information!



Recall my data format:



                Label: Message



I have 3 perspectives on that format:



Perspective #1: There is a sequence of two strings, separated by a colon.



Perspective #2: There is a label terminated by a colon, followed by a message.



Perspective #3: There is a label, a colon, and a message.



Below I show the element declarations for the 3 perspectives. I am wondering if I have inlined the correct amount of DFDL stuff, per Best Practice?



Perspective #1: There is a sequence of two strings, separated by a colon.



<xs:element name="really-simple-format">
    <xs:complexType>
        <xs:sequence dfdl:separator=":" dfdl:separatorPosition="infix">
            <xs:element name="label" type="xs:string" />
            <xs:element name="message" type="xs:string" />
        </xs:sequence>
    </xs:complexType>
</xs:element>



Perspective #2: There is a label terminated by a colon, followed by a message.



<xs:element name="really-simple-format">
    <xs:complexType>
        <xs:sequence>
            <xs:element name="label" type="xs:string" dfdl:terminator=":" />
            <xs:element name="message" type="xs:string" />
        </xs:sequence>
    </xs:complexType>
</xs:element>



Perspective #3: There is a label, a colon, and a message.



<xs:element name="really-simple-format">
    <xs:complexType>
        <xs:sequence>
            <xs:element name="label" type="xs:string"
                                dfdl:lengthUnits="characters"
                                dfdl:lengthKind="pattern"
                                dfdl:lengthPattern="[\x0D-\xFF]+?(?=[:])" />
            <xs:element name="colon" type="xs:string"
                                dfdl:lengthUnits="characters"
                                dfdl:lengthKind="explicit"

                                dfdl:length="1" />
            <xs:element name="message" type="xs:string" />
        </xs:sequence>
    </xs:complexType>
</xs:element>



For schemas 1 and 2, should I inline more DFDL stuff? For schema 3, do I have too much DFDL stuff?



/Roger









From: Mike Beckerle <mb...@tresys.com>
Sent: Wednesday, November 14, 2018 4:03 PM
To: users@daffodil.apache.org
Subject: Re: Best practice: all inline? no inline? mix?



Yes, of course the mix is best.



So why... well not all properties work the same way.



Delimiters are almost always going to be locally expressed on a model-group or element.



The escape schemes used for delimiters are almost always going to be defined centrally, but where they are used varies with the delimiter - e.g., a comma-separated sequence might allow escaping a comma embedded in an element value, but the CRLF at the end of a line might not be allowed to appear in data at all, i.e., cannot be escaped. So the escape scheme is defined centrally, but applied locally.



Policy properties like encodingErrorPolicy almost always want to be at top level.



LengthKind is a property that is quite problematic. I kind of wish we had complexType lengthKind distinguished from simpleType lengthKind because a very common situation in binary data wants all complex types to have dfdl:lengthKind="implicit", but simple types to have dfdl:lengthKind="explicit". And only one of those can be expressed as the default at top level. So for binary data often every single complex type element has a dfdl:ref="..." that refers to a named format that has dfdl:lengthKind="implicit". Maybe. That might not be necessary if the simple types all have type-definitions and those all say lengthKind="explicit" and are heavily reused.



Some formats, not all, are reasonably well behaved.



They have mostly similar properties used throughout. E.g., they use a single text encoding. A single byte order and bit-order, etc.



However, there are plenty of cases where quite diverse data is simply juxtaposed in a data format, so that characteristics of the data change wildly. A very common idiom is "envelope and payload" where an envelope, or header, format is used to augment a payload that is in a quite-different, and perhaps harder to access, data format. The envelope or header is often byte-aligned, byte-oriented, and well-behaved i.e., "easy" data, encapsulating a payload that is bit-oriented, bit-aligned, perhaps different bit-order or byte-order, different numeric representations, etc. NACT headers before Link16 payloads is a good example of this. MIL-STD-2045 headers before USMTF payloads is sort of the opposite example. That header format is bit-packed binary, and is quite complex/challenging, and USMTF is textual and, relatively speaking, easier.



DFDL has some properties that you aren't even allowed to put in top-level scope, because that would *never* make sense. E.g., dfdl:inputValueCalc.



A practice I consider valuable is that the top-level <dfdl:format.../> annotation of a DFDL schema file should always consist of exactly and only a reference to a named format.



E.g.,



<xs:annotation><xs:appinfo...>

    <dfdl:defineFormat name="xyzFormat2">

         <dfdl:format ... all the 'top-level' basic properties for this format... />

    </dfdl:defineFormat>



    <dfdl:format ref="tns:xyzFormat2"/> <!-- use the format -->



</xs:appinfo></xs:annotation>



That way you can reuse the format in another file that extends the schema, you can build variations of it easily, etc.



Another good practice is to put the basic format definition, as shown above, in its own DFDL schema file that is imported by the DFDL schema files that actually define types and elements and groups.





________________________________

From: Costello, Roger L. <co...@mitre.org>>
Sent: Wednesday, November 14, 2018 2:33:43 PM
To: users@daffodil.apache.org<ma...@daffodil.apache.org>
Subject: Best practice: all inline? no inline? mix?



Hello DFDL Community,

I have a simple data format:

        Label: Message

Here is a sample input:

        Dear Sir: Thank you for your response.

I will take this perspective:

        The data format consists of a series of
        two strings (label and message),
        separated by colon.

Here is the XML that I wish to produce:

<really-simple-format>
      <label>Dear Sir</label>
      <message>Thank you for your response.</message>
</really-simple-format>

I have identified 3 approaches to design the DFDL schema:

1. Inline the DFDL stuff with the XML schema stuff.

2. Don't inline any DFDL stuff; instead, put all the DFDL stuff at the top, inside xs:annotation.

3. Inline some DFDL stuff, put some DFDL stuff at the top, inside xs:annotation.

Below I show the 3 approaches. Which is best practice? I know you will say the third approach (mix approach) is best practice. Okay, then, which DFDL stuff should be inlined and which should be put at the top, inside xs:annotation? What is the rationale for how you divvy up the DFDL stuff between inline and at the top? Do you agree with how I divvied up the DFDL stuff? Would you put more stuff inline? If so, what other stuff would you put inline?

------------------------------------
    All Inline Approach
------------------------------------

<xs:element name="really-simple-format"
    dfdl:alignment="implicit"
    dfdl:alignmentUnits="bytes"
    dfdl:encoding="UTF-8"
    dfdl:escapeSchemeRef=""
    dfdl:ignoreCase="no"
    dfdl:initiator=""
    dfdl:leadingSkip="0"
    dfdl:lengthKind="delimited"
    dfdl:outputNewLine="%CR;%LF;"
    dfdl:representation="text"
    dfdl:terminator=""
    dfdl:textPadKind="none"
    dfdl:textTrimKind="none"
    dfdl:trailingSkip="0"
    dfdl:truncateSpecifiedLengthString="no"
    >
    <xs:complexType>
        <xs:sequence
            dfdl:alignment="implicit"
            dfdl:alignmentUnits="bytes"
            dfdl:encoding="UTF-8"
            dfdl:ignoreCase="no"
            dfdl:initiatedContent="no"
            dfdl:initiator=""
            dfdl:leadingSkip="0"
            dfdl:lengthKind="delimited"
            dfdl:outputNewLine="%CR;%LF;"
            dfdl:separator=":"
            dfdl:separatorPosition="infix"
            dfdl:separatorSuppressionPolicy="never"
            dfdl:sequenceKind="ordered"
            dfdl:terminator=""
            dfdl:trailingSkip="0"
            >
            <xs:element name="label" type="xs:string"
                dfdl:alignment="implicit"
                dfdl:alignmentUnits="bytes"
                dfdl:encoding="UTF-8"
                dfdl:escapeSchemeRef=""
                dfdl:ignoreCase="no"
                dfdl:initiator=""
                dfdl:leadingSkip="0"
                dfdl:lengthKind="delimited"
                dfdl:outputNewLine="%CR;%LF;"
                dfdl:representation="text"
                dfdl:terminator=""
                dfdl:textPadKind="none"
                dfdl:textTrimKind="none"
                dfdl:trailingSkip="0"
                dfdl:truncateSpecifiedLengthString="no"
            />
            <xs:element name="message" type="xs:string"
                dfdl:alignment="implicit"
                dfdl:alignmentUnits="bytes"
                dfdl:encoding="UTF-8"
                dfdl:escapeSchemeRef=""
                dfdl:ignoreCase="no"
                dfdl:initiator=""
                dfdl:leadingSkip="0"
                dfdl:lengthKind="delimited"
                dfdl:outputNewLine="%CR;%LF;"
                dfdl:representation="text"
                dfdl:terminator=""
                dfdl:textPadKind="none"
                dfdl:textTrimKind="none"
                dfdl:trailingSkip="0"
                dfdl:truncateSpecifiedLengthString="no"
            />
        </xs:sequence>
    </xs:complexType>
</xs:element>

------------------------------------
    No Inline Approach
------------------------------------

<xs:annotation>
    <xs:appinfo source="http://www.ogf.org/dfdl/">
        <dfdl:format
            alignment="1"
            alignmentUnits="bytes"
            binaryFloatRep="ieee"
            binaryNumberRep="binary"
            bitOrder="mostSignificantBitFirst"
            byteOrder="bigEndian"
            calendarPatternKind="implicit"
            documentFinalTerminatorCanBeMissing="yes"
            emptyValueDelimiterPolicy="none"
            encoding="ISO-8859-1"
            encodingErrorPolicy="replace"
            escapeSchemeRef=""
            fillByte="f"
            floating="no"
            ignoreCase="no"
            initiator=""
            initiatedContent="no"
            leadingSkip="0"
            lengthKind="delimited"
            lengthUnits="bits"
            nilKind="literalValue"
            nilValueDelimiterPolicy="none"
            occursCountKind="implicit"
            outputNewLine="%CR;%LF;"
            representation="text"
            separator=":"
            separatorPosition="infix"
            separatorSuppressionPolicy="never"
            sequenceKind="ordered"
            terminator=""
            textBidi="no"
            textNumberCheckPolicy="strict"
            textNumberPattern="#,##0.###;-#,##0.###"
            textNumberRep="standard"
            textNumberRounding="explicit"
            textNumberRoundingIncrement="0"
            textNumberRoundingMode="roundUnnecessary"
            textOutputMinLength="0"
            textPadKind="none"
            textStandardBase="10"
            textStandardExponentRep="E"
            textStandardInfinityRep="Inf"
            textStandardNaNRep="NaN"
            textStandardZeroRep="0"
            textStandardGroupingSeparator=","
            textTrimKind="none"
            trailingSkip="0"
            truncateSpecifiedLengthString="no"
            utf16Width="fixed"
        />
    </xs:appinfo>
</xs:annotation>

<xs:element name="really-simple-format">
    <xs:complexType>
        <xs:sequence>
            <xs:element name="label" type="xs:string" />
            <xs:element name="message" type="xs:string" />
        </xs:sequence>
    </xs:complexType>
</xs:element>

------------------------------------
    Mix Approach
------------------------------------

<xs:annotation>
    <xs:appinfo source="http://www.ogf.org/dfdl/">
        <dfdl:format
            alignment="1"
            alignmentUnits="bytes"
            binaryFloatRep="ieee"
            binaryNumberRep="binary"
            bitOrder="mostSignificantBitFirst"
            byteOrder="bigEndian"
            calendarPatternKind="implicit"
            documentFinalTerminatorCanBeMissing="yes"
            emptyValueDelimiterPolicy="none"
            encoding="ISO-8859-1"
            encodingErrorPolicy="replace"
            escapeSchemeRef=""
            fillByte="f"
            floating="no"
            ignoreCase="no"
            initiator=""
            initiatedContent="no"
            leadingSkip="0"
            lengthKind="delimited"
            lengthUnits="bits"
            nilKind="literalValue"
            nilValueDelimiterPolicy="none"
            occursCountKind="implicit"
            outputNewLine="%CR;%LF;"
            representation="text"
            separatorSuppressionPolicy="never"
            sequenceKind="ordered"
            terminator=""
            textBidi="no"
            textNumberCheckPolicy="strict"
            textNumberPattern="#,##0.###;-#,##0.###"
            textNumberRep="standard"
            textNumberRounding="explicit"
            textNumberRoundingIncrement="0"
            textNumberRoundingMode="roundUnnecessary"
            textOutputMinLength="0"
            textPadKind="none"
            textStandardBase="10"
            textStandardExponentRep="E"
            textStandardInfinityRep="Inf"
            textStandardNaNRep="NaN"
            textStandardZeroRep="0"
            textStandardGroupingSeparator=","
            textTrimKind="none"
            trailingSkip="0"
            truncateSpecifiedLengthString="no"
            utf16Width="fixed"
        />
    </xs:appinfo>
</xs:annotation>

<xs:element name="really-simple-format">
    <xs:complexType>
        <xs:sequence dfdl:separator=":" dfdl:separatorPosition="infix">
            <xs:element name="label" type="xs:string" />
            <xs:element name="message" type="xs:string" />
        </xs:sequence>
    </xs:complexType>
</xs:element>

RE: Best practice: all inline? no inline? mix?

Posted by "Costello, Roger L." <co...@mitre.org>.
Thank you Mike - outstanding information!

Recall my data format:

                Label: Message

I have 3 perspectives on that format:

Perspective #1: There is a sequence of two strings, separated by a colon.

Perspective #2: There is a label terminated by a colon, followed by a message.

Perspective #3: There is a label, a colon, and a message.

Below I show the element declarations for the 3 perspectives. I am wondering if I have inlined the correct amount of DFDL stuff, per Best Practice?

Perspective #1: There is a sequence of two strings, separated by a colon.

<xs:element name="really-simple-format">
    <xs:complexType>
        <xs:sequence dfdl:separator=":" dfdl:separatorPosition="infix">
            <xs:element name="label" type="xs:string" />
            <xs:element name="message" type="xs:string" />
        </xs:sequence>
    </xs:complexType>
</xs:element>

Perspective #2: There is a label terminated by a colon, followed by a message.

<xs:element name="really-simple-format">
    <xs:complexType>
        <xs:sequence>
            <xs:element name="label" type="xs:string" dfdl:terminator=":" />
            <xs:element name="message" type="xs:string" />
        </xs:sequence>
    </xs:complexType>
</xs:element>

Perspective #3: There is a label, a colon, and a message.

<xs:element name="really-simple-format">
    <xs:complexType>
        <xs:sequence>
            <xs:element name="label" type="xs:string"
                                dfdl:lengthUnits="characters"
                                dfdl:lengthKind="pattern"
                                dfdl:lengthPattern="[\x0D-\xFF]+?(?=[:])" />
            <xs:element name="colon" type="xs:string"
                                dfdl:lengthUnits="characters"
                                dfdl:lengthKind="explicit"
                                dfdl:length="1" />
            <xs:element name="message" type="xs:string" />
        </xs:sequence>
    </xs:complexType>
</xs:element>

For schemas 1 and 2, should I inline more DFDL stuff? For schema 3, do I have too much DFDL stuff?

/Roger




From: Mike Beckerle <mb...@tresys.com>
Sent: Wednesday, November 14, 2018 4:03 PM
To: users@daffodil.apache.org
Subject: Re: Best practice: all inline? no inline? mix?


Yes, of course the mix is best.



So why... well not all properties work the same way.



Delimiters are almost always going to be locally expressed on a model-group or element.



The escape schemes used for delimiters are almost always going to be defined centrally, but where they are used varies with the delimiter - e.g., a comma-separated sequence might allow escaping a comma embedded in an element value, but the CRLF at the end of a line might not be allowed to appear in data at all, i.e., cannot be escaped. So the escape scheme is defined centrally, but applied locally.



Policy properties like encodingErrorPolicy almost always want to be at top level.



LengthKind is a property that is quite problematic. I kind of wish we had complexType lengthKind distinguished from simpleType lengthKind because a very common situation in binary data wants all complex types to have dfdl:lengthKind="implicit", but simple types to have dfdl:lengthKind="explicit". And only one of those can be expressed as the default at top level. So for binary data often every single complex type element has a dfdl:ref="..." that refers to a named format that has dfdl:lengthKind="implicit". Maybe. That might not be necessary if the simple types all have type-definitions and those all say lengthKind="explicit" and are heavily reused.



Some formats, not all, are reasonably well behaved.



They have mostly similar properties used throughout. E.g., they use a single text encoding. A single byte order and bit-order, etc.



However, there are plenty of cases where quite diverse data is simply juxtaposed in a data format, so that characteristics of the data change wildly. A very common idiom is "envelope and payload" where an envelope, or header, format is used to augment a payload that is in a quite-different, and perhaps harder to access, data format. The envelope or header is often byte-aligned, byte-oriented, and well-behaved i.e., "easy" data, encapsulating a payload that is bit-oriented, bit-aligned, perhaps different bit-order or byte-order, different numeric representations, etc. NACT headers before Link16 payloads is a good example of this. MIL-STD-2045 headers before USMTF payloads is sort of the opposite example. That header format is bit-packed binary, and is quite complex/challenging, and USMTF is textual and, relatively speaking, easier.



DFDL has some properties that you aren't even allowed to put in top-level scope, because that would *never* make sense. E.g., dfdl:inputValueCalc.



A practice I consider valuable is that the top-level <dfdl:format.../> annotation of a DFDL schema file should always consist of exactly and only a reference to a named format.



E.g.,



<xs:annotation><xs:appinfo...>

    <dfdl:defineFormat name="xyzFormat2">

         <dfdl:format ... all the 'top-level' basic properties for this format... />

    </dfdl:defineFormat>



    <dfdl:format ref="tns:xyzFormat2"/> <!-- use the format -->



</xs:appinfo></xs:annotation>



That way you can reuse the format in another file that extends the schema, you can build variations of it easily, etc.



Another good practice is to put the basic format definition as above here, in its own DFDL schema file that is imported by the DFDL schema files that actually define types and elements and groups.





________________________________
From: Costello, Roger L. <co...@mitre.org>
Sent: Wednesday, November 14, 2018 2:33:43 PM
To: users@daffodil.apache.org
Subject: Best practice: all inline? no inline? mix?

Hello DFDL Community,

I have a simple data format:

        Label: Message

Here is a sample input:

        Dear Sir: Thank you for your response.

I will take this perspective:

        The data format consists of a series of
        two strings (label and message),
        separated by colon.

Here is the XML that I wish to produce:

<really-simple-format>
      <label>Dear Sir</label>
      <message>Thank you for your response.</message>
</really-simple-format>

I have identified 3 approaches to design the DFDL schema:

1. Inline the DFDL stuff with the XML schema stuff.

2. Don't inline any DFDL stuff; instead, put all the DFDL stuff at the top, inside xs:annotation.

3. Inline some DFDL stuff, put some DFDL stuff at the top, inside xs:annotation.

Below I show the 3 approaches. Which is best practice? I know you will say the third approach (mix approach) is best practice. Okay, then, which DFDL stuff should be inlined and which should be put at the top, inside xs:annotation? What is the rationale for how you divvy up the DFDL stuff between inline and at the top? Do you agree with how I divvied up the DFDL stuff? Would you put more stuff inline? If so, what other stuff would you put inline?

------------------------------------
    All Inline Approach
------------------------------------

<xs:element name="really-simple-format"
    dfdl:alignment="implicit"
    dfdl:alignmentUnits="bytes"
    dfdl:encoding="UTF-8"
    dfdl:escapeSchemeRef=""
    dfdl:ignoreCase="no"
    dfdl:initiator=""
    dfdl:leadingSkip="0"
    dfdl:lengthKind="delimited"
    dfdl:outputNewLine="%CR;%LF;"
    dfdl:representation="text"
    dfdl:terminator=""
    dfdl:textPadKind="none"
    dfdl:textTrimKind="none"
    dfdl:trailingSkip="0"
    dfdl:truncateSpecifiedLengthString="no"
    >
    <xs:complexType>
        <xs:sequence
            dfdl:alignment="implicit"
            dfdl:alignmentUnits="bytes"
            dfdl:encoding="UTF-8"
            dfdl:ignoreCase="no"
            dfdl:initiatedContent="no"
            dfdl:initiator=""
            dfdl:leadingSkip="0"
            dfdl:lengthKind="delimited"
            dfdl:outputNewLine="%CR;%LF;"
            dfdl:separator=":"
            dfdl:separatorPosition="infix"
            dfdl:separatorSuppressionPolicy="never"
            dfdl:sequenceKind="ordered"
            dfdl:terminator=""
            dfdl:trailingSkip="0"
            >
            <xs:element name="label" type="xs:string"
                dfdl:alignment="implicit"
                dfdl:alignmentUnits="bytes"
                dfdl:encoding="UTF-8"
                dfdl:escapeSchemeRef=""
                dfdl:ignoreCase="no"
                dfdl:initiator=""
                dfdl:leadingSkip="0"
                dfdl:lengthKind="delimited"
                dfdl:outputNewLine="%CR;%LF;"
                dfdl:representation="text"
                dfdl:terminator=""
                dfdl:textPadKind="none"
                dfdl:textTrimKind="none"
                dfdl:trailingSkip="0"
                dfdl:truncateSpecifiedLengthString="no"
            />
            <xs:element name="message" type="xs:string"
                dfdl:alignment="implicit"
                dfdl:alignmentUnits="bytes"
                dfdl:encoding="UTF-8"
                dfdl:escapeSchemeRef=""
                dfdl:ignoreCase="no"
                dfdl:initiator=""
                dfdl:leadingSkip="0"
                dfdl:lengthKind="delimited"
                dfdl:outputNewLine="%CR;%LF;"
                dfdl:representation="text"
                dfdl:terminator=""
                dfdl:textPadKind="none"
                dfdl:textTrimKind="none"
                dfdl:trailingSkip="0"
                dfdl:truncateSpecifiedLengthString="no"
            />
        </xs:sequence>
    </xs:complexType>
</xs:element>

------------------------------------
    No Inline Approach
------------------------------------

<xs:annotation>
    <xs:appinfo source="http://www.ogf.org/dfdl/">
        <dfdl:format
            alignment="1"
            alignmentUnits="bytes"
            binaryFloatRep="ieee"
            binaryNumberRep="binary"
            bitOrder="mostSignificantBitFirst"
            byteOrder="bigEndian"
            calendarPatternKind="implicit"
            documentFinalTerminatorCanBeMissing="yes"
            emptyValueDelimiterPolicy="none"
            encoding="ISO-8859-1"
            encodingErrorPolicy="replace"
            escapeSchemeRef=""
            fillByte="f"
            floating="no"
            ignoreCase="no"
            initiator=""
            initiatedContent="no"
            leadingSkip="0"
            lengthKind="delimited"
            lengthUnits="bits"
            nilKind="literalValue"
            nilValueDelimiterPolicy="none"
            occursCountKind="implicit"
            outputNewLine="%CR;%LF;"
            representation="text"
            separator=":"
            separatorPosition="infix"
            separatorSuppressionPolicy="never"
            sequenceKind="ordered"
            terminator=""
            textBidi="no"
            textNumberCheckPolicy="strict"
            textNumberPattern="#,##0.###;-#,##0.###"
            textNumberRep="standard"
            textNumberRounding="explicit"
            textNumberRoundingIncrement="0"
            textNumberRoundingMode="roundUnnecessary"
            textOutputMinLength="0"
            textPadKind="none"
            textStandardBase="10"
            textStandardExponentRep="E"
            textStandardInfinityRep="Inf"
            textStandardNaNRep="NaN"
            textStandardZeroRep="0"
            textStandardGroupingSeparator=","
            textTrimKind="none"
            trailingSkip="0"
            truncateSpecifiedLengthString="no"
            utf16Width="fixed"
        />
    </xs:appinfo>
</xs:annotation>

<xs:element name="really-simple-format">
    <xs:complexType>
        <xs:sequence>
            <xs:element name="label" type="xs:string" />
            <xs:element name="message" type="xs:string" />
        </xs:sequence>
    </xs:complexType>
</xs:element>

------------------------------------
    Mix Approach
------------------------------------

<xs:annotation>
    <xs:appinfo source="http://www.ogf.org/dfdl/">
        <dfdl:format
            alignment="1"
            alignmentUnits="bytes"
            binaryFloatRep="ieee"
            binaryNumberRep="binary"
            bitOrder="mostSignificantBitFirst"
            byteOrder="bigEndian"
            calendarPatternKind="implicit"
            documentFinalTerminatorCanBeMissing="yes"
            emptyValueDelimiterPolicy="none"
            encoding="ISO-8859-1"
            encodingErrorPolicy="replace"
            escapeSchemeRef=""
            fillByte="f"
            floating="no"
            ignoreCase="no"
            initiator=""
            initiatedContent="no"
            leadingSkip="0"
            lengthKind="delimited"
            lengthUnits="bits"
            nilKind="literalValue"
            nilValueDelimiterPolicy="none"
            occursCountKind="implicit"
            outputNewLine="%CR;%LF;"
            representation="text"
            separatorSuppressionPolicy="never"
            sequenceKind="ordered"
            terminator=""
            textBidi="no"
            textNumberCheckPolicy="strict"
            textNumberPattern="#,##0.###;-#,##0.###"
            textNumberRep="standard"
            textNumberRounding="explicit"
            textNumberRoundingIncrement="0"
            textNumberRoundingMode="roundUnnecessary"
            textOutputMinLength="0"
            textPadKind="none"
            textStandardBase="10"
            textStandardExponentRep="E"
            textStandardInfinityRep="Inf"
            textStandardNaNRep="NaN"
            textStandardZeroRep="0"
            textStandardGroupingSeparator=","
            textTrimKind="none"
            trailingSkip="0"
            truncateSpecifiedLengthString="no"
            utf16Width="fixed"
        />
    </xs:appinfo>
</xs:annotation>

<xs:element name="really-simple-format">
    <xs:complexType>
        <xs:sequence dfdl:separator=":" dfdl:separatorPosition="infix">
            <xs:element name="label" type="xs:string" />
            <xs:element name="message" type="xs:string" />
        </xs:sequence>
    </xs:complexType>
</xs:element>

Re: Best practice: all inline? no inline? mix?

Posted by Mike Beckerle <mb...@tresys.com>.
Yes, of course the mix is best.


So why... well not all properties work the same way.


Delimiters are almost always going to be locally expressed on a model-group or element.


The escape schemes used for delimiters are almost always going to be defined centrally, but where they are used, varies with the delimiter - e.g., a comma-separated sequence might allow escaping a comma embedded in an element value, but the CRLF at the end of a line might not be allowed to appear in data at all, i.e., cannot be escaped. So the escape scheme is defined centrally, but applied locally.


Policy properties like encodingErrorPolicy almost always want to be at top level.


LengthKind is a property that is quite problematic. I kind of wish we had complexType lengthKind distinguished from simpleType lengthKind because a very common situation in binary data wants all complex types to have dfdl:lengthKind="implicit", but simple types to have dfdl:lengthKind="explicit". And only one of those can be expressed as the default at top level. So for binary data often every single complex type element has a dfdl:ref="..." that refers to a named format that has dfdl:lengthKind="implicit". Maybe. That might not be necessary if the simple types all have type-definitions and those all say lengthKind="explicit" and are heavily reused.


Some formats, not all, are reasonably well behaved.


They have mostly similar properties used throughout. E.g., they use a single text encoding. A single byte order and bit-order, etc.


However, there are plenty of cases where quite diverse data is simply juxtaposed in a data format, so that characteristics of the data change wildly. A very common idiom is "envelope and payload" where an envelope, or header, format is used to augment a payload that is in a quite-different, and perhaps harder to access, data format. The envelope or header is often byte-aligned, byte-oriented, and well-behaved, i.e., "easy" data, encapsulating a payload that is bit-oriented, bit-aligned, perhaps different bit-order or byte-order, different numeric representations, etc. NACT headers before Link16 payloads is a good example of this. MIL-STD-2045 headers before USMTF payloads is sort of the opposite example. That header format is bit-packed binary, and is quite complex/challenging, and USMTF is textual and, relatively speaking, easier.


DFDL has some properties that you aren't even allowed to put in top-level scope, because that would *never* make sense. E.g., dfdl:inputValueCalc.


A practice I consider valuable is that the top level <dfdl:format.../> annotation of a DFDL schema file should always consist of exactly and only a reference to a named format.


E.g.,


<xs:annotation><xs:appinfo...>

    <dfdl:defineFormat name="xyzFormat2">

         <dfdl:format ... all the 'top-level' basic properties for this format... />

    </dfdl:defineFormat>


    <dfdl:format ref="tns:xyzFormat2"/> <!-- use the format -->


</xs:appinfo></xs:annotation>


That way you can reuse the format in another file that extends the schema, you can build variations of it easily, etc.


Another good practice is to put the basic format definition as above here, in its own DFDL schema file that is imported by the DFDL schema files that actually define types and elements and groups.



________________________________
From: Costello, Roger L. <co...@mitre.org>
Sent: Wednesday, November 14, 2018 2:33:43 PM
To: users@daffodil.apache.org
Subject: Best practice: all inline? no inline? mix?

Hello DFDL Community,

I have a simple data format:

        Label: Message

Here is a sample input:

        Dear Sir: Thank you for your response.

I will take this perspective:

        The data format consists of a series of
        two strings (label and message),
        separated by colon.

Here is the XML that I wish to produce:

<really-simple-format>
      <label>Dear Sir</label>
      <message>Thank you for your response.</message>
</really-simple-format>

I have identified 3 approaches to design the DFDL schema:

1. Inline the DFDL stuff with the XML schema stuff.

2. Don't inline any DFDL stuff; instead, put all the DFDL stuff at the top, inside xs:annotation.

3. Inline some DFDL stuff, put some DFDL stuff at the top, inside xs:annotation.

Below I show the 3 approaches. Which is best practice? I know you will say the third approach (mix approach) is best practice. Okay, then, which DFDL stuff should be inlined and which should be put at the top, inside xs:annotation? What is the rationale for how you divvy up the DFDL stuff between inline and at the top? Do you agree with how I divvied up the DFDL stuff? Would you put more stuff inline? If so, what other stuff would you put inline?

------------------------------------
    All Inline Approach
------------------------------------

<xs:element name="really-simple-format"
    dfdl:alignment="implicit"
    dfdl:alignmentUnits="bytes"
    dfdl:encoding="UTF-8"
    dfdl:escapeSchemeRef=""
    dfdl:ignoreCase="no"
    dfdl:initiator=""
    dfdl:leadingSkip="0"
    dfdl:lengthKind="delimited"
    dfdl:outputNewLine="%CR;%LF;"
    dfdl:representation="text"
    dfdl:terminator=""
    dfdl:textPadKind="none"
    dfdl:textTrimKind="none"
    dfdl:trailingSkip="0"
    dfdl:truncateSpecifiedLengthString="no"
    >
    <xs:complexType>
        <xs:sequence
            dfdl:alignment="implicit"
            dfdl:alignmentUnits="bytes"
            dfdl:encoding="UTF-8"
            dfdl:ignoreCase="no"
            dfdl:initiatedContent="no"
            dfdl:initiator=""
            dfdl:leadingSkip="0"
            dfdl:lengthKind="delimited"
            dfdl:outputNewLine="%CR;%LF;"
            dfdl:separator=":"
            dfdl:separatorPosition="infix"
            dfdl:separatorSuppressionPolicy="never"
            dfdl:sequenceKind="ordered"
            dfdl:terminator=""
            dfdl:trailingSkip="0"
            >
            <xs:element name="label" type="xs:string"
                dfdl:alignment="implicit"
                dfdl:alignmentUnits="bytes"
                dfdl:encoding="UTF-8"
                dfdl:escapeSchemeRef=""
                dfdl:ignoreCase="no"
                dfdl:initiator=""
                dfdl:leadingSkip="0"
                dfdl:lengthKind="delimited"
                dfdl:outputNewLine="%CR;%LF;"
                dfdl:representation="text"
                dfdl:terminator=""
                dfdl:textPadKind="none"
                dfdl:textTrimKind="none"
                dfdl:trailingSkip="0"
                dfdl:truncateSpecifiedLengthString="no"
            />
            <xs:element name="message" type="xs:string"
                dfdl:alignment="implicit"
                dfdl:alignmentUnits="bytes"
                dfdl:encoding="UTF-8"
                dfdl:escapeSchemeRef=""
                dfdl:ignoreCase="no"
                dfdl:initiator=""
                dfdl:leadingSkip="0"
                dfdl:lengthKind="delimited"
                dfdl:outputNewLine="%CR;%LF;"
                dfdl:representation="text"
                dfdl:terminator=""
                dfdl:textPadKind="none"
                dfdl:textTrimKind="none"
                dfdl:trailingSkip="0"
                dfdl:truncateSpecifiedLengthString="no"
            />
        </xs:sequence>
    </xs:complexType>
</xs:element>

------------------------------------
    No Inline Approach
------------------------------------

<xs:annotation>
    <xs:appinfo source="http://www.ogf.org/dfdl/">
        <dfdl:format
            alignment="1"
            alignmentUnits="bytes"
            binaryFloatRep="ieee"
            binaryNumberRep="binary"
            bitOrder="mostSignificantBitFirst"
            byteOrder="bigEndian"
            calendarPatternKind="implicit"
            documentFinalTerminatorCanBeMissing="yes"
            emptyValueDelimiterPolicy="none"
            encoding="ISO-8859-1"
            encodingErrorPolicy="replace"
            escapeSchemeRef=""
            fillByte="f"
            floating="no"
            ignoreCase="no"
            initiator=""
            initiatedContent="no"
            leadingSkip="0"
            lengthKind="delimited"
            lengthUnits="bits"
            nilKind="literalValue"
            nilValueDelimiterPolicy="none"
            occursCountKind="implicit"
            outputNewLine="%CR;%LF;"
            representation="text"
            separator=":"
            separatorPosition="infix"
            separatorSuppressionPolicy="never"
            sequenceKind="ordered"
            terminator=""
            textBidi="no"
            textNumberCheckPolicy="strict"
            textNumberPattern="#,##0.###;-#,##0.###"
            textNumberRep="standard"
            textNumberRounding="explicit"
            textNumberRoundingIncrement="0"
            textNumberRoundingMode="roundUnnecessary"
            textOutputMinLength="0"
            textPadKind="none"
            textStandardBase="10"
            textStandardExponentRep="E"
            textStandardInfinityRep="Inf"
            textStandardNaNRep="NaN"
            textStandardZeroRep="0"
            textStandardGroupingSeparator=","
            textTrimKind="none"
            trailingSkip="0"
            truncateSpecifiedLengthString="no"
            utf16Width="fixed"
        />
    </xs:appinfo>
</xs:annotation>

<xs:element name="really-simple-format">
    <xs:complexType>
        <xs:sequence>
            <xs:element name="label" type="xs:string" />
            <xs:element name="message" type="xs:string" />
        </xs:sequence>
    </xs:complexType>
</xs:element>

------------------------------------
    Mix Approach
------------------------------------

<xs:annotation>
    <xs:appinfo source="http://www.ogf.org/dfdl/">
        <dfdl:format
            alignment="1"
            alignmentUnits="bytes"
            binaryFloatRep="ieee"
            binaryNumberRep="binary"
            bitOrder="mostSignificantBitFirst"
            byteOrder="bigEndian"
            calendarPatternKind="implicit"
            documentFinalTerminatorCanBeMissing="yes"
            emptyValueDelimiterPolicy="none"
            encoding="ISO-8859-1"
            encodingErrorPolicy="replace"
            escapeSchemeRef=""
            fillByte="f"
            floating="no"
            ignoreCase="no"
            initiator=""
            initiatedContent="no"
            leadingSkip="0"
            lengthKind="delimited"
            lengthUnits="bits"
            nilKind="literalValue"
            nilValueDelimiterPolicy="none"
            occursCountKind="implicit"
            outputNewLine="%CR;%LF;"
            representation="text"
            separatorSuppressionPolicy="never"
            sequenceKind="ordered"
            terminator=""
            textBidi="no"
            textNumberCheckPolicy="strict"
            textNumberPattern="#,##0.###;-#,##0.###"
            textNumberRep="standard"
            textNumberRounding="explicit"
            textNumberRoundingIncrement="0"
            textNumberRoundingMode="roundUnnecessary"
            textOutputMinLength="0"
            textPadKind="none"
            textStandardBase="10"
            textStandardExponentRep="E"
            textStandardInfinityRep="Inf"
            textStandardNaNRep="NaN"
            textStandardZeroRep="0"
            textStandardGroupingSeparator=","
            textTrimKind="none"
            trailingSkip="0"
            truncateSpecifiedLengthString="no"
            utf16Width="fixed"
        />
    </xs:appinfo>
</xs:annotation>

<xs:element name="really-simple-format">
    <xs:complexType>
        <xs:sequence dfdl:separator=":" dfdl:separatorPosition="infix">
            <xs:element name="label" type="xs:string" />
            <xs:element name="message" type="xs:string" />
        </xs:sequence>
    </xs:complexType>
</xs:element>