You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@xerces.apache.org by pe...@apache.org on 2004/06/30 21:04:17 UTC

cvs commit: xml-xerces/c/src/xercesc/util/Transcoders/Uniconv390 XMLUTF8Transcoder390.cpp XMLUTF8Transcoder390.hpp

peiyongz    2004/06/30 12:04:17

  Modified:    c/src/xercesc/util/Transcoders/Uniconv390
                        XMLUTF8Transcoder390.cpp XMLUTF8Transcoder390.hpp
  Log:
  XML1.0-3rd Edition: UTF_8
  
  Revision  Changes    Path
  1.4       +177 -31   xml-xerces/c/src/xercesc/util/Transcoders/Uniconv390/XMLUTF8Transcoder390.cpp
  
  Index: XMLUTF8Transcoder390.cpp
  ===================================================================
  RCS file: /home/cvs/xml-xerces/c/src/xercesc/util/Transcoders/Uniconv390/XMLUTF8Transcoder390.cpp,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- XMLUTF8Transcoder390.cpp	6 Feb 2004 18:23:55 -0000	1.3
  +++ XMLUTF8Transcoder390.cpp	30 Jun 2004 19:04:17 -0000	1.4
  @@ -56,6 +56,9 @@
   
   /*
    * $Log$
  + * Revision 1.4  2004/06/30 19:04:17  peiyongz
  + * XML1.0-3rd Edition: UTF_8
  + *
    * Revision 1.3  2004/02/06 18:23:55  cargilld
    * Misc 390 changes.
    *
  @@ -164,7 +167,7 @@
       ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
       ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
       ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  -    ,   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
  +    ,   0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
       ,   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
       ,   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
       ,   3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
  @@ -291,33 +294,180 @@
               ThrowXMLwithMemMgr3(UTFDataFormatException, XMLExcepts::UTF8_FormatError, pos, byte, len, getMemoryManager());
           }
   
  -        XMLUInt32 tmpVal = *srcPtr++;
  -        tmpVal <<= 6;
  -        for(unsigned int i=1; i<trailingBytes; i++) 
  +        /***
  +         * http://www.unicode.org/reports/tr27/
  +         *
  +         * Table 3.1B. lists all of the byte sequences that are legal in UTF-8. 
  +         * A range of byte values such as A0..BF indicates that any byte from A0 to BF (inclusive) 
  +         * is legal in that position. 
  +         * Any byte value outside of the ranges listed is illegal. 
  +         * For example, 
  +         * the byte sequence <C0 AF> is illegal  since C0 is not legal in the 1st Byte column. 
  +         * The byte sequence <E0 9F 80> is illegal since in the row 
  +         *    where E0 is legal as a first byte, 
  +         *    9F is not legal as a second byte. 
  +         * The byte sequence <F4 80 83 92> is legal, since every byte in that sequence matches 
  +         * a byte range in a row of the table (the last row). 
  +         *
  +         *
  +         * Table 3.1B. Legal UTF-8 Byte Sequences  
  +         * Code Points              1st Byte    2nd Byte    3rd Byte    4th Byte 
  +         * =========================================================================
  +         * U+0000..U+007F            00..7F       
  +         * -------------------------------------------------------------------------
  +         * U+0080..U+07FF            C2..DF      80..BF      
  +         *
  +         * -------------------------------------------------------------------------
  +         * U+0800..U+0FFF            E0          A0..BF     80..BF   
  +         *                                       -- 
  +         *                          
  +         * U+1000..U+FFFF            E1..EF      80..BF     80..BF    
  +         *
  +         * --------------------------------------------------------------------------
  +         * U+10000..U+3FFFF          F0          90..BF     80..BF       80..BF 
  +         *                                       --
  +         * U+40000..U+FFFFF          F1..F3      80..BF     80..BF       80..BF 
  +         * U+100000..U+10FFFF        F4          80..8F     80..BF       80..BF 
  +         *                                           --
  +         * ==========================================================================
  +         *
  +         *  Cases where a trailing byte range is not 80..BF are underlined in the table to 
  +         *  draw attention to them. These occur only in the second byte of a sequence.
  +         *
  +         ***/
  +
  +        XMLUInt32 tmpVal = 0;
  +
  +        switch(trailingBytes)
           {
  -            if((*srcPtr & 0xC0) == 0x80) 
  -            {
  -                tmpVal += *srcPtr++; 
  +            case 1 :
  +                // UTF-8:   [110y yyyy] [10xx xxxx]
  +                // Unicode: [0000 0yyy] [yyxx xxxx]
  +                //
  +                // 0xC0 and 0xC1 have been filtered out
  +                checkTrailingBytes(*(srcPtr+1), 1, 1);
  +
  +                tmpVal = *srcPtr++;
  +                tmpVal <<= 6;
  +                tmpVal += *srcPtr++;
  +
  +                break;
  +            case 2 :
  +                // UTF-8:   [1110 zzzz] [10yy yyyy] [10xx xxxx]
  +                // Unicode: [zzzz yyyy] [yyxx xxxx]
  +                //
  +                if (( *srcPtr == 0xE0) && ( *(srcPtr+1) < 0xA0)) 
  +                {
  +                    char byte0[2] = {*srcPtr    ,0};
  +                    char byte1[2] = {*(srcPtr+1),0};
  +
  +                    ThrowXMLwithMemMgr2(UTFDataFormatException
  +                                      , XMLExcepts::UTF8_Invalid_3BytesSeq
  +                                      , byte0
  +                                      , byte1
  +                                      , getMemoryManager());
  +                }
  +
  +                checkTrailingBytes(*(srcPtr+1), 2, 1);
  +                checkTrailingBytes(*(srcPtr+2), 2, 2);
  +
  +                //
  +                // D36 (a) UTF-8 is the Unicode Transformation Format that serializes 
  +                //         a Unicode code point as a sequence of one to four bytes, 
  +                //         as specified in Table 3.1, UTF-8 Bit Distribution.
  +                //     (b) An illegal UTF-8 code unit sequence is any byte sequence that 
  +                //         does not match the patterns listed in Table 3.1B, Legal UTF-8 
  +                //         Byte Sequences.
  +                //     (c) An irregular UTF-8 code unit sequence is a six-byte sequence 
  +                //         where the first three bytes correspond to a high surrogate, 
  +                //         and the next three bytes correspond to a low surrogate. 
  +                //         As a consequence of C12, these irregular UTF-8 sequences shall 
  +                //         not be generated by a conformant process. 
  +                //
  +                // Irregular three-byte sequence,
  +                // that is zzzzyy matches leading surrogate tag 110110 or 
  +                //                       trailing surrogate tag 110111
  +                // *srcPtr=1110 1101 
  +                // *(srcPtr+1)=1010 yyyy or 
  +                // *(srcPtr+1)=1011 yyyy
  +                //
  +                // 0xED 1110 1101
  +                // 0xA0 1010 0000
  +
  +                if ((*srcPtr == 0xED) && (*(srcPtr+1) >= 0xA0))
  +                {
  +                    char byte0[2] = {*srcPtr,    0};
  +                    char byte1[2] = {*(srcPtr+1),0};
  +
  +                     ThrowXMLwithMemMgr2(UTFDataFormatException
  +                              , XMLExcepts::UTF8_Irregular_3BytesSeq
  +                              , byte0
  +                              , byte1
  +                              , getMemoryManager());
  +                }
  +
  +                tmpVal = *srcPtr++;
  +                tmpVal <<= 6;
  +                tmpVal += *srcPtr++;
  +                tmpVal <<= 6;
  +                tmpVal += *srcPtr++;
  +
  +                break;
  +            case 3 : 
  +                // UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
  +                // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
  +                //          [1101 11yy] [yyxx xxxx] (low surrogate)
  +                //          * uuuuu = wwww + 1
  +                //
  +                if (((*srcPtr == 0xF0) && (*(srcPtr+1) < 0x90)) ||
  +                    ((*srcPtr == 0xF4) && (*(srcPtr+1) > 0x8F))  )
  +                {
  +                    char byte0[2] = {*srcPtr    ,0};
  +                    char byte1[2] = {*(srcPtr+1),0};
  +
  +                    ThrowXMLwithMemMgr2(UTFDataFormatException
  +                                      , XMLExcepts::UTF8_Invalid_4BytesSeq
  +                                      , byte0
  +                                      , byte1
  +                                      , getMemoryManager());
  +                }
  +
  +                checkTrailingBytes(*(srcPtr+1), 3, 1);
  +                checkTrailingBytes(*(srcPtr+2), 3, 2);
  +                checkTrailingBytes(*(srcPtr+3), 3, 3);
  +                
  +                tmpVal = *srcPtr++;
  +                tmpVal <<= 6;
  +                tmpVal += *srcPtr++;
  +                tmpVal <<= 6;
  +                tmpVal += *srcPtr++;
                   tmpVal <<= 6;
  -            } 
  -            else
  -            {
  -                char len[2] = {(char)(trailingBytes+0x31), 0};
  -                char pos[2]= {(char)(i+0x31), 0};
  +                tmpVal += *srcPtr++;
  +
  +                break;
  +            default: // trailingBytes > 3
  +
  +                /***
  +                 * The definition of UTF-8 in Annex D of ISO/IEC 10646-1:2000 also allows 
  +                 * for the use of five- and six-byte sequences to encode characters that 
  +                 * are outside the range of the Unicode character set; those five- and 
  +                 * six-byte sequences are illegal for the use of UTF-8 as a transformation 
  +                 * of Unicode characters. ISO/IEC 10646 does not allow mapping of unpaired 
  +                 * surrogates, nor U+FFFE and U+FFFF (but it does allow other noncharacters).
  +                 ***/
  +                char len[2]  = {(char)(trailingBytes+0x31), 0};
                   char byte[2] = {*srcPtr,0};
  -                ThrowXMLwithMemMgr3(UTFDataFormatException, XMLExcepts::UTF8_FormatError, pos, byte, len, getMemoryManager());
  -            }
  -        }
  -        if((*srcPtr & 0xC0) == 0x80) 
  -        {
  -            tmpVal += *srcPtr++;
  -        }
  -        else 
  -        {
  -            char len[2] = {(char)(trailingBytes+0x31), 0};
  -            char byte[2] = {*srcPtr,0};
  -            ThrowXMLwithMemMgr3(UTFDataFormatException, XMLExcepts::UTF8_FormatError, len, byte, len, getMemoryManager());
  +
  +                ThrowXMLwithMemMgr2(UTFDataFormatException
  +                                  , XMLExcepts::UTF8_Exceede_BytesLimit
  +                                  , byte
  +                                  , len
  +                                  , getMemoryManager());
  +
  +                break;
           }
  +
  +
           // since trailingBytes comes from an array, this logic is redundant
           //  default :
           //      ThrowXML(TranscodingException, XMLExcepts::Trans_BadSrcSeq);
  @@ -371,7 +521,7 @@
               //  one to be zero.
               //
               *sizePtr++ = 0;
  -            *outPtr++ = XMLCh(tmpVal & 0x3FF) + 0xDC00;
  +            *outPtr++ = XMLCh((tmpVal & 0x3FF) + 0xDC00);
           }
       }
   
  @@ -438,12 +588,8 @@
               encodedBytes = 2;
           else if (curVal < 0x10000)
               encodedBytes = 3;
  -        else if (curVal < 0x200000)
  +        else if (curVal < 0x110000)
               encodedBytes = 4;
  -        else if (curVal < 0x4000000)
  -            encodedBytes = 5;
  -        else if (curVal <= 0x7FFFFFFF)
  -            encodedBytes = 6;
           else
           {
               // If the options say to throw, then throw
  
  
  
  1.4       +28 -0     xml-xerces/c/src/xercesc/util/Transcoders/Uniconv390/XMLUTF8Transcoder390.hpp
  
  Index: XMLUTF8Transcoder390.hpp
  ===================================================================
  RCS file: /home/cvs/xml-xerces/c/src/xercesc/util/Transcoders/Uniconv390/XMLUTF8Transcoder390.hpp,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- XMLUTF8Transcoder390.hpp	22 Apr 2004 22:46:46 -0000	1.3
  +++ XMLUTF8Transcoder390.hpp	30 Jun 2004 19:04:17 -0000	1.4
  @@ -56,6 +56,9 @@
   
   /*
    * $Log$
  + * Revision 1.4  2004/06/30 19:04:17  peiyongz
  + * XML1.0-3rd Edition: UTF_8
  + *
    * Revision 1.3  2004/04/22 22:46:46  neilg
    * not all 390 processors support the new transcoding instructions; this patch makes Xerces work there as well.  Thanks to Steve Dulin
    *
  @@ -69,6 +72,7 @@
   
   #include <xercesc/util/XercesDefs.hpp>
   #include <xercesc/util/TransService.hpp>
  +#include <xercesc/util/UTFDataFormatException.hpp>
   
   XERCES_CPP_NAMESPACE_BEGIN
   
  @@ -126,12 +130,36 @@
   
   
   private :
  +
  +    inline void checkTrailingBytes(
  +                                    const XMLByte      toCheck
  +                                  , const unsigned int trailingBytes
  +                                  , const unsigned int position       
  +                                  ) const;
  +
  +private :
       // -----------------------------------------------------------------------
       //  Unimplemented constructors and operators
       // -----------------------------------------------------------------------
       XMLUTF8Transcoder390(const XMLUTF8Transcoder390&);
       XMLUTF8Transcoder390& operator=(const XMLUTF8Transcoder390&);
   };
  +
  +inline // Validates one trailing byte of a multi-byte UTF-8 sequence; reports a format error if it is malformed.
  +void XMLUTF8Transcoder390::checkTrailingBytes(const XMLByte      toCheck // the byte value to validate
  +                                            , const unsigned int trailingBytes // trailing-byte count of the sequence; used only to build the error text
  +                                            , const unsigned int position) const // index of toCheck after the lead byte (callers shown pass 1..3); used only to build the error text
  +{
  +
  +    if((toCheck & 0xC0) != 0x80) // a trailing byte must have the bit pattern 10xx xxxx
  +    {
  +        char len[2]  = {(char)(trailingBytes+0x31), 0}; // one-character string: 0x31 + trailingBytes (0x31 is ASCII '1', so this is the digit for trailingBytes+1) -- NOTE(review): confirm ASCII is what the message text expects on this platform
  +        char pos[2]  = {(char)(position+0x31), 0}; // same encoding applied to position
  +        char byte[2] = {toCheck,0}; // the offending byte as a one-character string
  +        ThrowXMLwithMemMgr3(UTFDataFormatException, XMLExcepts::UTF8_FormatError, pos, byte, len, getMemoryManager()); // message arguments are passed in the order pos, byte, len
  +    }
  +
  +}
   
   XERCES_CPP_NAMESPACE_END
   
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: xerces-cvs-unsubscribe@xml.apache.org
For additional commands, e-mail: xerces-cvs-help@xml.apache.org