You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@xerces.apache.org by pe...@apache.org on 2004/06/30 21:03:26 UTC
cvs commit: xml-xerces/c/src/xercesc/util XMLUTF8Transcoder.cpp
peiyongz 2004/06/30 12:03:26
Modified: c/src/xercesc/util XMLUTF8Transcoder.cpp
Log:
XML1.0-3rd Edition: UTF_8
Revision Changes Path
1.9 +39 -26 xml-xerces/c/src/xercesc/util/XMLUTF8Transcoder.cpp
Index: XMLUTF8Transcoder.cpp
===================================================================
RCS file: /home/cvs/xml-xerces/c/src/xercesc/util/XMLUTF8Transcoder.cpp,v
retrieving revision 1.8
retrieving revision 1.9
diff -u -r1.8 -r1.9
--- XMLUTF8Transcoder.cpp 19 May 2004 20:51:20 -0000 1.8
+++ XMLUTF8Transcoder.cpp 30 Jun 2004 19:03:26 -0000 1.9
@@ -106,7 +106,7 @@
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+ , 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
@@ -262,24 +262,21 @@
*
***/
+ XMLUInt32 tmpVal = 0;
+
switch(trailingBytes)
{
case 1 :
// UTF-8: [110y yyyy] [10xx xxxx]
// Unicode: [0000 0yyy] [yyxx xxxx]
//
- if (*srcPtr < 0xC2)
- {
- char byte[2] = {*srcPtr,0};
-
- ThrowXMLwithMemMgr1(UTFDataFormatException
- , XMLExcepts::UTF8_Invalid_2BytesSeq
- , byte
- , getMemoryManager());
- }
-
+ // 0xC0 and 0xC1 have been filtered out
checkTrailingBytes(*(srcPtr+1), 1, 1);
+ tmpVal = *srcPtr++;
+ tmpVal <<= 6;
+ tmpVal += *srcPtr++;
+
break;
case 2 :
// UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx]
@@ -300,6 +297,19 @@
checkTrailingBytes(*(srcPtr+1), 2, 1);
checkTrailingBytes(*(srcPtr+2), 2, 2);
+ //
+ // D36 (a) UTF-8 is the Unicode Transformation Format that serializes
+ // a Unicode code point as a sequence of one to four bytes,
+ // as specified in Table 3.1, UTF-8 Bit Distribution.
+ // (b) An illegal UTF-8 code unit sequence is any byte sequence that
+ // does not match the patterns listed in Table 3.1B, Legal UTF-8
+ // Byte Sequences.
+ // (c) An irregular UTF-8 code unit sequence is a six-byte sequence
+ // where the first three bytes correspond to a high surrogate,
+ // and the next three bytes correspond to a low surrogate.
+ // As a consequence of C12, these irregular UTF-8 sequences shall
+ // not be generated by a conformant process.
+ //
//irregular three bytes sequence
// that is zzzzyy matches leading surrogate tag 110110 or
// trailing surrogate tag 110111
@@ -322,6 +332,12 @@
, getMemoryManager());
}
+ tmpVal = *srcPtr++;
+ tmpVal <<= 6;
+ tmpVal += *srcPtr++;
+ tmpVal <<= 6;
+ tmpVal += *srcPtr++;
+
break;
case 3 :
// UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
@@ -342,10 +358,18 @@
, getMemoryManager());
}
- checkTrailingBytes(*(srcPtr+1), 2, 1);
- checkTrailingBytes(*(srcPtr+2), 2, 1);
- checkTrailingBytes(*(srcPtr+3), 2, 1);
+ checkTrailingBytes(*(srcPtr+1), 3, 1);
+ checkTrailingBytes(*(srcPtr+2), 3, 2);
+ checkTrailingBytes(*(srcPtr+3), 3, 3);
+ tmpVal = *srcPtr++;
+ tmpVal <<= 6;
+ tmpVal += *srcPtr++;
+ tmpVal <<= 6;
+ tmpVal += *srcPtr++;
+ tmpVal <<= 6;
+ tmpVal += *srcPtr++;
+
break;
default: // trailingBytes > 3
@@ -369,17 +393,6 @@
break;
}
- // All bytes have been verified, need not to check any more
-
- XMLUInt32 tmpVal = *srcPtr++;
- tmpVal <<= 6;
- for(unsigned int i=1; i<trailingBytes; i++)
- {
- tmpVal += *srcPtr++;
- tmpVal <<= 6;
- }
-
- tmpVal += *srcPtr++;
// since trailingBytes comes from an array, this logic is redundant
// default :
---------------------------------------------------------------------
To unsubscribe, e-mail: xerces-cvs-unsubscribe@xml.apache.org
For additional commands, e-mail: xerces-cvs-help@xml.apache.org