You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@xerces.apache.org by pe...@apache.org on 2004/05/19 22:51:20 UTC
cvs commit: xml-xerces/c/src/xercesc/util TransService.hpp XMLUTF8Transcoder.cpp XMLUTF8Transcoder.hpp

peiyongz    2004/05/19 13:51:20

  Modified:    c/src/xercesc/util TransService.hpp XMLUTF8Transcoder.cpp
                        XMLUTF8Transcoder.hpp
  Log:
  XML1.0-3rd Edition: UTF_8
  
  Revision  Changes    Path
  1.13      +4 -1      xml-xerces/c/src/xercesc/util/TransService.hpp
  
  Index: TransService.hpp
  ===================================================================
  RCS file: /home/cvs/xml-xerces/c/src/xercesc/util/TransService.hpp,v
  retrieving revision 1.12
  retrieving revision 1.13
  diff -u -r1.12 -r1.13
  --- TransService.hpp	29 Jan 2004 11:48:46 -0000	1.12
  +++ TransService.hpp	19 May 2004 20:51:20 -0000	1.13
  @@ -56,6 +56,9 @@
   
   /*
    * $Log$
  + * Revision 1.13  2004/05/19 20:51:20  peiyongz
  + * XML1.0-3rd Edition: UTF_8
  + *
    * Revision 1.12  2004/01/29 11:48:46  cargilld
    * Code cleanup changes to get rid of various compiler diagnostic messages.
    *
  @@ -384,7 +387,7 @@
       /** Converts from the encoding of the service to the internal XMLCh* encoding
         *
         * @param srcData the source buffer to be transcoded
  -      * @param srcCount number of characters in the source buffer
  +      * @param srcCount number of bytes in the source buffer
         * @param toFill the destination buffer
         * @param maxChars the max number of characters in the destination buffer
         * @param bytesEaten after transcoding, this will hold the number of bytes
  
  
  
  1.8       +160 -33   xml-xerces/c/src/xercesc/util/XMLUTF8Transcoder.cpp
  
  Index: XMLUTF8Transcoder.cpp
  ===================================================================
  RCS file: /home/cvs/xml-xerces/c/src/xercesc/util/XMLUTF8Transcoder.cpp,v
  retrieving revision 1.7
  retrieving revision 1.8
  diff -u -r1.7 -r1.8
  --- XMLUTF8Transcoder.cpp	29 Jan 2004 11:48:47 -0000	1.7
  +++ XMLUTF8Transcoder.cpp	19 May 2004 20:51:20 -0000	1.8
  @@ -65,8 +65,6 @@
   #include <xercesc/util/XMLString.hpp>
   #include <xercesc/util/XMLUniDefs.hpp>
   #include <xercesc/util/XMLUTF8Transcoder.hpp>
  -#include <xercesc/util/UTFDataFormatException.hpp>
  -
   
   XERCES_CPP_NAMESPACE_BEGIN
   
  @@ -140,9 +138,8 @@
   // ---------------------------------------------------------------------------
   XMLUTF8Transcoder::XMLUTF8Transcoder(const  XMLCh* const    encodingName
                                       , const unsigned int    blockSize
  -                                    , MemoryManager* const  manager) :
  -
  -    XMLTranscoder(encodingName, blockSize, manager)
  +                                    , MemoryManager* const  manager)
  +:XMLTranscoder(encodingName, blockSize, manager)
   {
   }
   
  @@ -223,33 +220,167 @@
               ThrowXMLwithMemMgr3(UTFDataFormatException, XMLExcepts::UTF8_FormatError, pos, byte, len, getMemoryManager());
           }
   
  -        XMLUInt32 tmpVal = *srcPtr++;
  -        tmpVal <<= 6;
  -        for(unsigned int i=1; i<trailingBytes; i++) 
  +        /***
  +         * http://www.unicode.org/reports/tr27/
  +         *
  +         * Table 3.1B. lists all of the byte sequences that are legal in UTF-8. 
  +         * A range of byte values such as A0..BF indicates that any byte from A0 to BF (inclusive) 
  +         * is legal in that position. 
  +         * Any byte value outside of the ranges listed is illegal. 
  +         * For example, 
  +         * the byte sequence <C0 AF> is illegal  since C0 is not legal in the 1st Byte column. 
  +         * The byte sequence <E0 9F 80> is illegal since in the row 
  +         *    where E0 is legal as a first byte, 
  +         *    9F is not legal as a second byte. 
  +         * The byte sequence <F4 80 83 92> is legal, since every byte in that sequence matches 
  +         * a byte range in a row of the table (the last row). 
  +         *
  +         *
  +         * Table 3.1B. Legal UTF-8 Byte Sequences  
  +         * Code Points              1st Byte    2nd Byte    3rd Byte    4th Byte 
  +         * =========================================================================
  +         * U+0000..U+007F            00..7F       
  +         * -------------------------------------------------------------------------
  +         * U+0080..U+07FF            C2..DF      80..BF      
  +         *
  +         * -------------------------------------------------------------------------
  +         * U+0800..U+0FFF            E0          A0..BF     80..BF   
  +         *                                       -- 
  +         *                          
  +         * U+1000..U+FFFF            E1..EF      80..BF     80..BF    
  +         *
  +         * --------------------------------------------------------------------------
  +         * U+10000..U+3FFFF          F0          90..BF     80..BF       80..BF 
  +         *                                       --
  +         * U+40000..U+FFFFF          F1..F3      80..BF     80..BF       80..BF 
  +         * U+100000..U+10FFFF        F4          80..8F     80..BF       80..BF 
  +         *                                           --
  +         * ==========================================================================
  +         *
  +         *  Cases where a trailing byte range is not 80..BF are underlined in the table to 
  +         *  draw attention to them. These occur only in the second byte of a sequence.
  +         *
  +         ***/
  +
  +        switch(trailingBytes)
           {
  -            if((*srcPtr & 0xC0) == 0x80) 
  -            {
  -                tmpVal += *srcPtr++; 
  -                tmpVal <<= 6;
  -            } 
  -            else
  -            {
  -                char len[2] = {(char)(trailingBytes+0x31), 0};
  -                char pos[2]= {(char)(i+0x31), 0};
  +            case 1 :
  +                // UTF-8:   [110y yyyy] [10xx xxxx]
  +                // Unicode: [0000 0yyy] [yyxx xxxx]
  +                //
  +                if (*srcPtr < 0xC2) 
  +                {
  +                    char byte[2] = {*srcPtr,0};
  +
  +                    ThrowXMLwithMemMgr1(UTFDataFormatException
  +                                      , XMLExcepts::UTF8_Invalid_2BytesSeq
  +                                      , byte
  +                                      , getMemoryManager());
  +                }
  +              
  +                checkTrailingBytes(*(srcPtr+1), 1, 1);
  +
  +                break;
  +            case 2 :
  +                // UTF-8:   [1110 zzzz] [10yy yyyy] [10xx xxxx]
  +                // Unicode: [zzzz yyyy] [yyxx xxxx]
  +                //
  +                if (( *srcPtr == 0xE0) && ( *(srcPtr+1) < 0xA0)) 
  +                {
  +                    char byte0[2] = {*srcPtr    ,0};
  +                    char byte1[2] = {*(srcPtr+1),0};
  +
  +                    ThrowXMLwithMemMgr2(UTFDataFormatException
  +                                      , XMLExcepts::UTF8_Invalid_3BytesSeq
  +                                      , byte0
  +                                      , byte1
  +                                      , getMemoryManager());
  +                }
  +
  +                checkTrailingBytes(*(srcPtr+1), 2, 1);
  +                checkTrailingBytes(*(srcPtr+2), 2, 2);
  +
  +                //irregular three bytes sequence
  +                // that is zzzzyy matches leading surrogate tag 110110 or 
  +                //                       trailing surrogate tag 110111
  +                // *srcPtr=1110 1101 
  +                // *(srcPtr+1)=1010 yyyy or 
  +                // *(srcPtr+1)=1011 yyyy
  +                //
  +                // 0xED 1110 1101
  +                // 0xA0 1010 0000
  +
  +                if ((*srcPtr == 0xED) && (*(srcPtr+1) >= 0xA0))
  +                {
  +                    char byte0[2] = {*srcPtr,    0};
  +                    char byte1[2] = {*(srcPtr+1),0};
  +
  +                     ThrowXMLwithMemMgr2(UTFDataFormatException
  +                              , XMLExcepts::UTF8_Irregular_3BytesSeq
  +                              , byte0
  +                              , byte1
  +                              , getMemoryManager());
  +                }
  +
  +                break;
  +            case 3 : 
  +                // UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
  +                // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
  +                //          [1101 11yy] [yyxx xxxx] (low surrogate)
  +                //          * uuuuu = wwww + 1
  +                // NOTE(review): lead bytes F5..F7 are not rejected by this arm - confirm they are filtered elsewhere.
  +                if (((*srcPtr == 0xF0) && (*(srcPtr+1) < 0x90)) ||
  +                    ((*srcPtr == 0xF4) && (*(srcPtr+1) > 0x8F))  )
  +                {
  +                    char byte0[2] = {*srcPtr    ,0};
  +                    char byte1[2] = {*(srcPtr+1),0};
  +
  +                    ThrowXMLwithMemMgr2(UTFDataFormatException
  +                                      , XMLExcepts::UTF8_Invalid_4BytesSeq
  +                                      , byte0
  +                                      , byte1
  +                                      , getMemoryManager());
  +                }
  +                // Three trailing bytes: pass trailingBytes=3 and each byte's own position (as cases 1 and 2 do).
  +                checkTrailingBytes(*(srcPtr+1), 3, 1);
  +                checkTrailingBytes(*(srcPtr+2), 3, 2);
  +                checkTrailingBytes(*(srcPtr+3), 3, 3);
  +                
  +                break;
  +            default: // trailingBytes > 3
  +
  +                /***
  +                 * The definition of UTF-8 in Annex D of ISO/IEC 10646-1:2000 also allows 
  +                 * for the use of five- and six-byte sequences to encode characters that 
  +                 * are outside the range of the Unicode character set; those five- and 
  +                 * six-byte sequences are illegal for the use of UTF-8 as a transformation 
  +                 * of Unicode characters. ISO/IEC 10646 does not allow mapping of unpaired 
  +                 * surrogates, nor U+FFFE and U+FFFF (but it does allow other noncharacters).
  +                 ***/
  +                char len[2]  = {(char)(trailingBytes+0x31), 0};
                   char byte[2] = {*srcPtr,0};
  -                ThrowXMLwithMemMgr3(UTFDataFormatException, XMLExcepts::UTF8_FormatError, pos, byte, len, getMemoryManager());
  -            }
  -        }
  -        if((*srcPtr & 0xC0) == 0x80) 
  -        {
  -            tmpVal += *srcPtr++;
  +
  +                ThrowXMLwithMemMgr2(UTFDataFormatException
  +                                  , XMLExcepts::UTF8_Exceede_BytesLimit
  +                                  , byte
  +                                  , len
  +                                  , getMemoryManager());
  +
  +                break;
           }
  -        else 
  +
  +        // All bytes have been verified, need not to check any more
  +
  +        XMLUInt32 tmpVal = *srcPtr++;
  +        tmpVal <<= 6;
  +        for(unsigned int i=1; i<trailingBytes; i++) 
           {
  -            char len[2] = {(char)(trailingBytes+0x31), 0};
  -            char byte[2] = {*srcPtr,0};
  -            ThrowXMLwithMemMgr3(UTFDataFormatException, XMLExcepts::UTF8_FormatError, len, byte, len, getMemoryManager());
  +            tmpVal += *srcPtr++; 
  +            tmpVal <<= 6;
           }
  +
  +        tmpVal += *srcPtr++;
  +
           // since trailingBytes comes from an array, this logic is redundant
           //  default :
           //      ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Trans_BadSrcSeq);
  @@ -370,12 +501,8 @@
               encodedBytes = 2;
           else if (curVal < 0x10000)
               encodedBytes = 3;
  -        else if (curVal < 0x200000)
  +        else if (curVal < 0x110000)
               encodedBytes = 4;
  -        else if (curVal < 0x4000000)
  -            encodedBytes = 5;
  -        else if (curVal <= 0x7FFFFFFF)
  -            encodedBytes = 6;
           else
           {
               // If the options say to throw, then throw
  
  
  
  1.5       +25 -0     xml-xerces/c/src/xercesc/util/XMLUTF8Transcoder.hpp
  
  Index: XMLUTF8Transcoder.hpp
  ===================================================================
  RCS file: /home/cvs/xml-xerces/c/src/xercesc/util/XMLUTF8Transcoder.hpp,v
  retrieving revision 1.4
  retrieving revision 1.5
  diff -u -r1.4 -r1.5
  --- XMLUTF8Transcoder.hpp	17 Dec 2003 00:18:35 -0000	1.4
  +++ XMLUTF8Transcoder.hpp	19 May 2004 20:51:20 -0000	1.5
  @@ -59,6 +59,7 @@
   
   #include <xercesc/util/XercesDefs.hpp>
   #include <xercesc/util/TransService.hpp>
  +#include <xercesc/util/UTFDataFormatException.hpp>
   
   XERCES_CPP_NAMESPACE_BEGIN
   
  @@ -116,12 +117,36 @@
   
   
   private :
  +
  +    inline void checkTrailingBytes(
  +                                    const XMLByte      toCheck
  +                                  , const unsigned int trailingBytes
  +                                  , const unsigned int position       
  +                                  ) const;
  +
  +private :
       // -----------------------------------------------------------------------
       //  Unimplemented constructors and operators
       // -----------------------------------------------------------------------
       XMLUTF8Transcoder(const XMLUTF8Transcoder&);
       XMLUTF8Transcoder& operator=(const XMLUTF8Transcoder&);
   };
  +
  +inline 
  +void XMLUTF8Transcoder::checkTrailingBytes(const XMLByte      toCheck           // byte to validate
  +                                          , const unsigned int trailingBytes   // bytes that follow the lead byte
  +                                          , const unsigned int position) const // 1 = first byte after the lead byte
  +{
  +    // Raises UTFDataFormatException (UTF8_FormatError) unless toCheck has the bit pattern 10xx xxxx.
  +    if((toCheck & 0xC0) != 0x80) 
  +    {
  +        char len[2]  = {(char)(trailingBytes+0x31), 0};  // 0x31 is '1': the digit for trailingBytes + 1
  +        char pos[2]  = {(char)(position+0x31), 0};       // the digit for position + 1
  +        char byte[2] = {(char)toCheck,0};                // explicit cast like len/pos; avoids brace-init narrowing if XMLByte is not char
  +        ThrowXMLwithMemMgr3(UTFDataFormatException, XMLExcepts::UTF8_FormatError, pos, byte, len, getMemoryManager());
  +    }
  +    // A byte of the form 10xx xxxx falls through with no effect.
  +}
   
   XERCES_CPP_NAMESPACE_END
   
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: xerces-cvs-unsubscribe@xml.apache.org
For additional commands, e-mail: xerces-cvs-help@xml.apache.org