You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@xerces.apache.org by pe...@apache.org on 2004/06/30 21:03:26 UTC

cvs commit: xml-xerces/c/src/xercesc/util XMLUTF8Transcoder.cpp

peiyongz    2004/06/30 12:03:26

  Modified:    c/src/xercesc/util XMLUTF8Transcoder.cpp
  Log:
  XML1.0-3rd Edition: UTF_8
  
  Revision  Changes    Path
  1.9       +39 -26    xml-xerces/c/src/xercesc/util/XMLUTF8Transcoder.cpp
  
  Index: XMLUTF8Transcoder.cpp
  ===================================================================
  RCS file: /home/cvs/xml-xerces/c/src/xercesc/util/XMLUTF8Transcoder.cpp,v
  retrieving revision 1.8
  retrieving revision 1.9
  diff -u -r1.8 -r1.9
  --- XMLUTF8Transcoder.cpp	19 May 2004 20:51:20 -0000	1.8
  +++ XMLUTF8Transcoder.cpp	30 Jun 2004 19:03:26 -0000	1.9
  @@ -106,7 +106,7 @@
       ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
       ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
       ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  -    ,   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
  +    ,   0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
       ,   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
       ,   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
       ,   3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
  @@ -262,24 +262,21 @@
            *
            ***/
   
  +        XMLUInt32 tmpVal = 0;
  +
           switch(trailingBytes)
           {
               case 1 :
                   // UTF-8:   [110y yyyy] [10xx xxxx]
                   // Unicode: [0000 0yyy] [yyxx xxxx]
                   //
  -                if (*srcPtr < 0xC2) 
  -                {
  -                    char byte[2] = {*srcPtr,0};
  -
  -                    ThrowXMLwithMemMgr1(UTFDataFormatException
  -                                      , XMLExcepts::UTF8_Invalid_2BytesSeq
  -                                      , byte
  -                                      , getMemoryManager());
  -                }
  -              
  +                // 0xC0 and 0xC1 have already been filtered out
                   checkTrailingBytes(*(srcPtr+1), 1, 1);
   
  +                tmpVal = *srcPtr++;
  +                tmpVal <<= 6;
  +                tmpVal += *srcPtr++;
  +
                   break;
               case 2 :
                   // UTF-8:   [1110 zzzz] [10yy yyyy] [10xx xxxx]
  @@ -300,6 +297,19 @@
                   checkTrailingBytes(*(srcPtr+1), 2, 1);
                   checkTrailingBytes(*(srcPtr+2), 2, 2);
   
  +                //
  +                // D36 (a) UTF-8 is the Unicode Transformation Format that serializes 
  +                //         a Unicode code point as a sequence of one to four bytes, 
  +                //         as specified in Table 3.1, UTF-8 Bit Distribution.
  +                //     (b) An illegal UTF-8 code unit sequence is any byte sequence that 
  +                //         does not match the patterns listed in Table 3.1B, Legal UTF-8 
  +                //         Byte Sequences.
  +                //     (c) An irregular UTF-8 code unit sequence is a six-byte sequence 
  +                //         where the first three bytes correspond to a high surrogate, 
  +                //         and the next three bytes correspond to a low surrogate. 
  +                //         As a consequence of C12, these irregular UTF-8 sequences shall 
  +                //         not be generated by a conformant process. 
  +                //
                   //irregular three bytes sequence
                   // that is zzzzyy matches leading surrogate tag 110110 or 
                   //                       trailing surrogate tag 110111
  @@ -322,6 +332,12 @@
                                 , getMemoryManager());
                   }
   
  +                tmpVal = *srcPtr++;
  +                tmpVal <<= 6;
  +                tmpVal += *srcPtr++;
  +                tmpVal <<= 6;
  +                tmpVal += *srcPtr++;
  +
                   break;
               case 3 : 
                   // UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
  @@ -342,10 +358,18 @@
                                         , getMemoryManager());
                   }
   
  -                checkTrailingBytes(*(srcPtr+1), 2, 1);
  -                checkTrailingBytes(*(srcPtr+2), 2, 1);
  -                checkTrailingBytes(*(srcPtr+3), 2, 1);
  +                checkTrailingBytes(*(srcPtr+1), 3, 1);
  +                checkTrailingBytes(*(srcPtr+2), 3, 2);
  +                checkTrailingBytes(*(srcPtr+3), 3, 3);
                   
  +                tmpVal = *srcPtr++;
  +                tmpVal <<= 6;
  +                tmpVal += *srcPtr++;
  +                tmpVal <<= 6;
  +                tmpVal += *srcPtr++;
  +                tmpVal <<= 6;
  +                tmpVal += *srcPtr++;
  +
                   break;
               default: // trailingBytes > 3
   
  @@ -369,17 +393,6 @@
                   break;
           }
   
  -        // All bytes have been verified, need not to check any more
  -
  -        XMLUInt32 tmpVal = *srcPtr++;
  -        tmpVal <<= 6;
  -        for(unsigned int i=1; i<trailingBytes; i++) 
  -        {
  -            tmpVal += *srcPtr++; 
  -            tmpVal <<= 6;
  -        }
  -
  -        tmpVal += *srcPtr++;
   
           // since trailingBytes comes from an array, this logic is redundant
           //  default :
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: xerces-cvs-unsubscribe@xml.apache.org
For additional commands, e-mail: xerces-cvs-help@xml.apache.org