You are viewing a plain text version of this content. The canonical link for it is here.
Posted to c-dev@xerces.apache.org by Mark Everline <ma...@level8.com> on 2001/02/12 20:23:24 UTC

Core when UTF-16 encoding Controdicts actual encoding.....

Hi,

In any message where the declared encoding contradicts the actual
encoding, the parser core dumps because XMLReader::setEncoding(...)
deletes fEncodingStr before the error check, leaving the variable in
an invalid state.

Problem: I have a customer who decided to do the following:
   <?xml version="1.0" encoding="UTF-16"?>

The only problem is that the actual encoding is UTF-8. When the parser
reads in the file, you get the following warning:

Warning at file
"/home/ox3/snap/everline/snap/sassaby/snap/s/tsto/xml/input/bad/Tina_Koslovsky.xml",
line 1, column 40
   Message: Encoding (UTF-16, from XMLDecl or manually set) contradicts
the auto-sensed encoding, ignoring it

The parser then later core dumps when trying to set the transcoder,
because fEncodingStr is 0. I have attached the XML file and the fix to
XMLReader.
I am not sure whether this is a bug/problem or not, mainly because the
XML is questionable in the first place. Also, I am using the DOMParser.

    

     Mark.


Changes To XMLReader.cpp:  ( marked with  // mee )

bool XMLReader::setEncoding(const XMLCh* const newEncoding)
{
    //
    //  If the encoding was forced, then we ignore the new value and
just
    //  return with success. If it was forced, then we are to use that
    //  encoding without question. Note that, if we are forced, we
created
    //  a transcoder up front so there is no need to do one here in that
    //  case.
    //
    if (fForcedEncoding)
        return true;

    // Clean up the old encoding string
// mee - don't delete until we known we have a good encoding. 
//    if (fEncodingStr)
//    {
//        delete [] fEncodingStr;
//        fEncodingStr = 0;
//   }

    //
    //  Try to map the string to one of our standard encodings. If its
not
    //  one of them, then it has to be one of the non-intrinsic
encodings,
    //  in which case we have to delete our intrinsic encoder and create
a
    //  new one.
    //
    XMLRecognizer::Encodings newBaseEncoding =
XMLRecognizer::encodingForName
    (
        newEncoding
    );

    //
    //  If it does not come back as one of the auto-sensed encodings,
then we
    //  have to possibly replace it and at least check a few things.
    //
    if (newBaseEncoding == XMLRecognizer::OtherEncoding)
    {
        //
        //  Check for non-endian specific UTF-16 or UCS-4. If so, and if
we
        //  are already in one of the endian versions of those
encodings,
        //  then just keep it and go on. Otherwise, its not valid.
        //
        if (!XMLString::compareIString(newEncoding,
XMLUni::fgUTF16EncodingString)
        ||  !XMLString::compareIString(newEncoding,
XMLUni::fgUTF16EncodingString2)
        ||  !XMLString::compareIString(newEncoding,
XMLUni::fgUTF16EncodingString3)
        ||  !XMLString::compareIString(newEncoding,
XMLUni::fgUTF16EncodingString4))
        {
            if ((fEncoding != XMLRecognizer::UTF_16L)
            &&  (fEncoding != XMLRecognizer::UTF_16B))
            {
                return false;
            }

            // Override with the original endian specific encoding
            newBaseEncoding = fEncoding;

            if (fEncoding == XMLRecognizer::UTF_16L) {
	       delete [] fEncodingStr;   // mee
               fEncodingStr =
XMLString::replicate(XMLUni::fgUTF16LEncodingString);
	    }
            else {
	       delete [] fEncodingStr;   // mee
               fEncodingStr =
XMLString::replicate(XMLUni::fgUTF16BEncodingString);
	    }
        }
         else if (!XMLString::compareIString(newEncoding,
XMLUni::fgUCS4EncodingString)
              ||  !XMLString::compareIString(newEncoding,
XMLUni::fgUCS4EncodingString2)
              ||  !XMLString::compareIString(newEncoding,
XMLUni::fgUCS4EncodingString3))
        {
            if ((fEncoding != XMLRecognizer::UCS_4L)
            &&  (fEncoding != XMLRecognizer::UCS_4B))
            {
                return false;
            }

            // Override with the original endian specific encoding
            newBaseEncoding = fEncoding;

            if (fEncoding == XMLRecognizer::UCS_4L) {
	       delete [] fEncodingStr;   // mee
               fEncodingStr =
XMLString::replicate(XMLUni::fgUCS4LEncodingString);
	    }
            else {
	       delete [] fEncodingStr;   // mee
               fEncodingStr =
XMLString::replicate(XMLUni::fgUCS4BEncodingString);
	    }
        }
         else
        {
            // None of those special cases, so just replicate the new
name
	    delete [] fEncodingStr;  // mee
            fEncodingStr = XMLString::replicate(newEncoding);
        }
    }
     else
    {
        // Store the new encoding string since it is just an intrinsic
        delete [] fEncodingStr;  // mee
        fEncodingStr = XMLString::replicate(newEncoding);
    }

    //
    //  Now we can create a transcoder using the transcoding service. We
    //  might get back a transcoder for an intrinsically supported
encoding,
    //  or we might get one from the underlying transcoding service.
    //
    XMLTransService::Codes failReason;
    fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
    (
        fEncodingStr
        , failReason
        , kCharBufSize
    );

    if (!fTranscoder)
        ThrowXML1(TranscodingException,
XMLExcepts::Trans_CantCreateCvtrFor, fEncodingStr);

    // Update the base encoding member with the new base encoding found
    fEncoding = newBaseEncoding;

    // Looks ok to us
    return true;
}


Test .xml File: (and yes, I know there are other problems with the XML)

<?xml version="1.0" encoding="UTF-16"?>

<!DOCTYPE OBJECTS [
	<!ELEMENT OBJECTS ( CUSTOMER|ORDER)*>

	<!ELEMENT CUSTOMER EMPTY >

	<!ATTLIST CUSTOMER
		Orders 		IDREFS #IMPLIED
		Id 		CDATA #IMPLIED
		FirstName 		CDATA #IMPLIED
		LastName 		CDATA #IMPLIED
		CompanyName 		CDATA #IMPLIED
		__objName ID #IMPLIED>

	<!ELEMENT ORDER EMPTY >

	<!ATTLIST ORDER
		Id 		CDATA #IMPLIED
		ArticleName 		CDATA #IMPLIED
		Amount 		CDATA #IMPLIED
		__objName ID #IMPLIED>

]>


	<OBJECTS>
	<CUSTOMER 
		 __objName="CUSTOMER__"
		 Orders="ORDER__1 ORDER__2 "
		 Id="3"
		 FirstName="Tina"
		 LastName="Koslovsky"
		 CompanyName="Irgendwo GmbH&oKG"/>
	<ORDER 
		 __objName="ORDER__1"
		 Id="6"
		 ArticleName="Fruchtquark"
		 Amount="45"/>
	<ORDER 
		 __objName="ORDER__2"
		 Id="7"
		 ArticleName="Romadour"
		 Amount="20"/>
	</OBJECTS>


-- 
#-------------------------------------------------------------------------
# Phone: 703-925-1487(w)                # 703-729-4463(h)
# email: mark.everline@level8.com (w)   # meverline@loudoun.com (h) 
#-------------------------------------------------------------------------
#
# Any day you get to breath underwater is a good day.
#                  - A Scuba Divers motto 
#
#-------------------------------------------------------------------------

Re: Core when UTF-16 encoding Controdicts actual encoding.....

Posted by Khaled Noaman <kn...@ca.ibm.com>.
Mark,

The fix is now in CVS. Thanks

Khaled

Mark Everline wrote:

> Hi,
>
> In any message when the Encoding controdicts the actuall encoding the
> the parser core dumps because the XMLReader::setEncoding(...) deletes
> the fEncodingStr before error check and leaving the variable in a
> invalid
> state.
>
> Problem: I have a customer whom decicded to do the following:
>    <?xml version="1.0" encoding="UTF-16"?>
>
> the only problem is the actuall encoding is UTF-8. When the parser reads
> in the file you get the following warning:
>
> Warning at file
> "/home/ox3/snap/everline/snap/sassaby/snap/s/tsto/xml/input/bad/Tina_Koslovsky.xml",
> line 1, column 40
>    Message: Encoding (UTF-16, from XMLDecl or manually set) contradicts
> the auto-sensed encoding, ignoring it
>
> the parser then latter core dumps when trieing to set the Transcoder
> because
> the fEncodingStr is 0. I have attached the xml file and the fix to
> XMLReader.
> Not sure weather this is a bug/problem or not mainly because the XML is
> questionable in the first place. Also I am using the DOMParser.
>
>
>
>      Mark.
>
> Changes To XMLReader.cpp:  ( marked with  // mee )
>
> bool XMLReader::setEncoding(const XMLCh* const newEncoding)
> {
>     //
>     //  If the encoding was forced, then we ignore the new value and
> just
>     //  return with success. If it was forced, then we are to use that
>     //  encoding without question. Note that, if we are forced, we
> created
>     //  a transcoder up front so there is no need to do one here in that
>     //  case.
>     //
>     if (fForcedEncoding)
>         return true;
>
>     // Clean up the old encoding string
> // mee - don't delete until we known we have a good encoding.
> //    if (fEncodingStr)
> //    {
> //        delete [] fEncodingStr;
> //        fEncodingStr = 0;
> //   }
>
>     //
>     //  Try to map the string to one of our standard encodings. If its
> not
>     //  one of them, then it has to be one of the non-intrinsic
> encodings,
>     //  in which case we have to delete our intrinsic encoder and create
> a
>     //  new one.
>     //
>     XMLRecognizer::Encodings newBaseEncoding =
> XMLRecognizer::encodingForName
>     (
>         newEncoding
>     );
>
>     //
>     //  If it does not come back as one of the auto-sensed encodings,
> then we
>     //  have to possibly replace it and at least check a few things.
>     //
>     if (newBaseEncoding == XMLRecognizer::OtherEncoding)
>     {
>         //
>         //  Check for non-endian specific UTF-16 or UCS-4. If so, and if
> we
>         //  are already in one of the endian versions of those
> encodings,
>         //  then just keep it and go on. Otherwise, its not valid.
>         //
>         if (!XMLString::compareIString(newEncoding,
> XMLUni::fgUTF16EncodingString)
>         ||  !XMLString::compareIString(newEncoding,
> XMLUni::fgUTF16EncodingString2)
>         ||  !XMLString::compareIString(newEncoding,
> XMLUni::fgUTF16EncodingString3)
>         ||  !XMLString::compareIString(newEncoding,
> XMLUni::fgUTF16EncodingString4))
>         {
>             if ((fEncoding != XMLRecognizer::UTF_16L)
>             &&  (fEncoding != XMLRecognizer::UTF_16B))
>             {
>                 return false;
>             }
>
>             // Override with the original endian specific encoding
>             newBaseEncoding = fEncoding;
>
>             if (fEncoding == XMLRecognizer::UTF_16L) {
>                delete [] fEncodingStr;   // mee
>                fEncodingStr =
> XMLString::replicate(XMLUni::fgUTF16LEncodingString);
>             }
>             else {
>                delete [] fEncodingStr;   // mee
>                fEncodingStr =
> XMLString::replicate(XMLUni::fgUTF16BEncodingString);
>             }
>         }
>          else if (!XMLString::compareIString(newEncoding,
> XMLUni::fgUCS4EncodingString)
>               ||  !XMLString::compareIString(newEncoding,
> XMLUni::fgUCS4EncodingString2)
>               ||  !XMLString::compareIString(newEncoding,
> XMLUni::fgUCS4EncodingString3))
>         {
>             if ((fEncoding != XMLRecognizer::UCS_4L)
>             &&  (fEncoding != XMLRecognizer::UCS_4B))
>             {
>                 return false;
>             }
>
>             // Override with the original endian specific encoding
>             newBaseEncoding = fEncoding;
>
>             if (fEncoding == XMLRecognizer::UCS_4L) {
>                delete [] fEncodingStr;   // mee
>                fEncodingStr =
> XMLString::replicate(XMLUni::fgUCS4LEncodingString);
>             }
>             else {
>                delete [] fEncodingStr;   // mee
>                fEncodingStr =
> XMLString::replicate(XMLUni::fgUCS4BEncodingString);
>             }
>         }
>          else
>         {
>             // None of those special cases, so just replicate the new
> name
>             delete [] fEncodingStr;  // mee
>             fEncodingStr = XMLString::replicate(newEncoding);
>         }
>     }
>      else
>     {
>         // Store the new encoding string since it is just an intrinsic
>         delete [] fEncodingStr;  // mee
>         fEncodingStr = XMLString::replicate(newEncoding);
>     }
>
>     //
>     //  Now we can create a transcoder using the transcoding service. We
>     //  might get back a transcoder for an intrinsically supported
> encoding,
>     //  or we might get one from the underlying transcoding service.
>     //
>     XMLTransService::Codes failReason;
>     fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
>     (
>         fEncodingStr
>         , failReason
>         , kCharBufSize
>     );
>
>     if (!fTranscoder)
>         ThrowXML1(TranscodingException,
> XMLExcepts::Trans_CantCreateCvtrFor, fEncodingStr);
>
>     // Update the base encoding member with the new base encoding found
>     fEncoding = newBaseEncoding;
>
>     // Looks ok to us
>     return true;
> }
>
> Test .xml File: (and yes I known there are other problems with the XML)
>
> <?xml version="1.0" encoding="UTF-16"?>
>
> <!DOCTYPE OBJECTS [
>         <!ELEMENT OBJECTS ( CUSTOMER|ORDER)*>
>
>         <!ELEMENT CUSTOMER EMPTY >
>
>         <!ATTLIST CUSTOMER
>                 Orders          IDREFS #IMPLIED
>                 Id              CDATA #IMPLIED
>                 FirstName               CDATA #IMPLIED
>                 LastName                CDATA #IMPLIED
>                 CompanyName             CDATA #IMPLIED
>                 __objName ID #IMPLIED>
>
>         <!ELEMENT ORDER EMPTY >
>
>         <!ATTLIST ORDER
>                 Id              CDATA #IMPLIED
>                 ArticleName             CDATA #IMPLIED
>                 Amount          CDATA #IMPLIED
>                 __objName ID #IMPLIED>
>
> ]>
>
>         <OBJECTS>
>         <CUSTOMER
>                  __objName="CUSTOMER__"
>                  Orders="ORDER__1 ORDER__2 "
>                  Id="3"
>                  FirstName="Tina"
>                  LastName="Koslovsky"
>                  CompanyName="Irgendwo GmbH&oKG"/>
>         <ORDER
>                  __objName="ORDER__1"
>                  Id="6"
>                  ArticleName="Fruchtquark"
>                  Amount="45"/>
>         <ORDER
>                  __objName="ORDER__2"
>                  Id="7"
>                  ArticleName="Romadour"
>                  Amount="20"/>
>         </OBJECTS>
>
> --
> #-------------------------------------------------------------------------
> # Phone: 703-925-1487(w)                # 703-729-4463(h)
> # email: mark.everline@level8.com (w)   # meverline@loudoun.com (h)
> #-------------------------------------------------------------------------
> #
> # Any day you get to breath underwater is a good day.
> #                  - A Scuba Divers motto
> #
> #-------------------------------------------------------------------------
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: xerces-c-dev-unsubscribe@xml.apache.org
> For additional commands, e-mail: xerces-c-dev-help@xml.apache.org