You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@xerces.apache.org by pe...@apache.org on 2004/06/30 21:04:17 UTC
cvs commit: xml-xerces/c/src/xercesc/util/Transcoders/Uniconv390 XMLUTF8Transcoder390.cpp XMLUTF8Transcoder390.hpp
peiyongz 2004/06/30 12:04:17
Modified: c/src/xercesc/util/Transcoders/Uniconv390
XMLUTF8Transcoder390.cpp XMLUTF8Transcoder390.hpp
Log:
XML1.0-3rd Edition: UTF_8
Revision Changes Path
1.4 +177 -31 xml-xerces/c/src/xercesc/util/Transcoders/Uniconv390/XMLUTF8Transcoder390.cpp
Index: XMLUTF8Transcoder390.cpp
===================================================================
RCS file: /home/cvs/xml-xerces/c/src/xercesc/util/Transcoders/Uniconv390/XMLUTF8Transcoder390.cpp,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- XMLUTF8Transcoder390.cpp 6 Feb 2004 18:23:55 -0000 1.3
+++ XMLUTF8Transcoder390.cpp 30 Jun 2004 19:04:17 -0000 1.4
@@ -56,6 +56,9 @@
/*
* $Log$
+ * Revision 1.4 2004/06/30 19:04:17 peiyongz
+ * XML1.0-3rd Edition: UTF_8
+ *
* Revision 1.3 2004/02/06 18:23:55 cargilld
* Misc 390 changes.
*
@@ -164,7 +167,7 @@
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+ , 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
@@ -291,33 +294,180 @@
ThrowXMLwithMemMgr3(UTFDataFormatException, XMLExcepts::UTF8_FormatError, pos, byte, len, getMemoryManager());
}
- XMLUInt32 tmpVal = *srcPtr++;
- tmpVal <<= 6;
- for(unsigned int i=1; i<trailingBytes; i++)
+ /***
+ * http://www.unicode.org/reports/tr27/
+ *
+ * Table 3.1B. lists all of the byte sequences that are legal in UTF-8.
+ * A range of byte values such as A0..BF indicates that any byte from A0 to BF (inclusive)
+ * is legal in that position.
+ * Any byte value outside of the ranges listed is illegal.
+ * For example,
+ * the byte sequence <C0 AF> is illegal since C0 is not legal in the 1st Byte column.
+ * The byte sequence <E0 9F 80> is illegal since in the row
+ * where E0 is legal as a first byte,
+ * 9F is not legal as a second byte.
+ * The byte sequence <F4 80 83 92> is legal, since every byte in that sequence matches
+ * a byte range in a row of the table (the last row).
+ *
+ *
+ * Table 3.1B. Legal UTF-8 Byte Sequences
+ * Code Points           1st Byte   2nd Byte   3rd Byte   4th Byte
+ * =========================================================================
+ * U+0000..U+007F        00..7F
+ * -------------------------------------------------------------------------
+ * U+0080..U+07FF        C2..DF     80..BF
+ *
+ * -------------------------------------------------------------------------
+ * U+0800..U+0FFF        E0         A0..BF     80..BF
+ *                                  --
+ *
+ * U+1000..U+FFFF        E1..EF     80..BF     80..BF
+ *
+ * --------------------------------------------------------------------------
+ * U+10000..U+3FFFF      F0         90..BF     80..BF     80..BF
+ *                                  --
+ * U+40000..U+FFFFF      F1..F3     80..BF     80..BF     80..BF
+ * U+100000..U+10FFFF    F4         80..8F     80..BF     80..BF
+ *                                      --
+ * ==========================================================================
+ *
+ * Cases where a trailing byte range is not 80..BF are underlined in the table to
+ * draw attention to them. These occur only in the second byte of a sequence.
+ *
+ ***/
+
+ XMLUInt32 tmpVal = 0;
+
+ switch(trailingBytes)
{
- if((*srcPtr & 0xC0) == 0x80)
- {
- tmpVal += *srcPtr++;
+ case 1 :
+ // UTF-8: [110y yyyy] [10xx xxxx]
+ // Unicode: [0000 0yyy] [yyxx xxxx]
+ //
+ // 0xC0 and 0xC1 have already been filtered out
+ checkTrailingBytes(*(srcPtr+1), 1, 1);
+
+ tmpVal = *srcPtr++;
+ tmpVal <<= 6;
+ tmpVal += *srcPtr++;
+
+ break;
+ case 2 :
+ // UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx]
+ // Unicode: [zzzz yyyy] [yyxx xxxx]
+ //
+ if (( *srcPtr == 0xE0) && ( *(srcPtr+1) < 0xA0))
+ {
+ char byte0[2] = {*srcPtr ,0};
+ char byte1[2] = {*(srcPtr+1),0};
+
+ ThrowXMLwithMemMgr2(UTFDataFormatException
+ , XMLExcepts::UTF8_Invalid_3BytesSeq
+ , byte0
+ , byte1
+ , getMemoryManager());
+ }
+
+ checkTrailingBytes(*(srcPtr+1), 2, 1);
+ checkTrailingBytes(*(srcPtr+2), 2, 2);
+
+ //
+ // D36 (a) UTF-8 is the Unicode Transformation Format that serializes
+ // a Unicode code point as a sequence of one to four bytes,
+ // as specified in Table 3.1, UTF-8 Bit Distribution.
+ // (b) An illegal UTF-8 code unit sequence is any byte sequence that
+ // does not match the patterns listed in Table 3.1B, Legal UTF-8
+ // Byte Sequences.
+ // (c) An irregular UTF-8 code unit sequence is a six-byte sequence
+ // where the first three bytes correspond to a high surrogate,
+ // and the next three bytes correspond to a low surrogate.
+ // As a consequence of C12, these irregular UTF-8 sequences shall
+ // not be generated by a conformant process.
+ //
+ //irregular three bytes sequence
+ // that is zzzzyy matches leading surrogate tag 110110 or
+ // trailing surrogate tag 110111
+ // *srcPtr=1110 1101
+ // *(srcPtr+1)=1010 yyyy or
+ // *(srcPtr+1)=1011 yyyy
+ //
+ // 0xED 1110 1101
+ // 0xA0 1010 0000
+
+ if ((*srcPtr == 0xED) && (*(srcPtr+1) >= 0xA0))
+ {
+ char byte0[2] = {*srcPtr, 0};
+ char byte1[2] = {*(srcPtr+1),0};
+
+ ThrowXMLwithMemMgr2(UTFDataFormatException
+ , XMLExcepts::UTF8_Irregular_3BytesSeq
+ , byte0
+ , byte1
+ , getMemoryManager());
+ }
+
+ tmpVal = *srcPtr++;
+ tmpVal <<= 6;
+ tmpVal += *srcPtr++;
+ tmpVal <<= 6;
+ tmpVal += *srcPtr++;
+
+ break;
+ case 3 :
+ // UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
+ // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
+ // [1101 11yy] [yyxx xxxx] (low surrogate)
+ // * uuuuu = wwww + 1
+ //
+ if (((*srcPtr == 0xF0) && (*(srcPtr+1) < 0x90)) ||
+ ((*srcPtr == 0xF4) && (*(srcPtr+1) > 0x8F)) )
+ {
+ char byte0[2] = {*srcPtr ,0};
+ char byte1[2] = {*(srcPtr+1),0};
+
+ ThrowXMLwithMemMgr2(UTFDataFormatException
+ , XMLExcepts::UTF8_Invalid_4BytesSeq
+ , byte0
+ , byte1
+ , getMemoryManager());
+ }
+
+ checkTrailingBytes(*(srcPtr+1), 3, 1);
+ checkTrailingBytes(*(srcPtr+2), 3, 2);
+ checkTrailingBytes(*(srcPtr+3), 3, 3);
+
+ tmpVal = *srcPtr++;
+ tmpVal <<= 6;
+ tmpVal += *srcPtr++;
+ tmpVal <<= 6;
+ tmpVal += *srcPtr++;
tmpVal <<= 6;
- }
- else
- {
- char len[2] = {(char)(trailingBytes+0x31), 0};
- char pos[2]= {(char)(i+0x31), 0};
+ tmpVal += *srcPtr++;
+
+ break;
+ default: // trailingBytes > 3
+
+ /***
+ * The definition of UTF-8 in Annex D of ISO/IEC 10646-1:2000 also allows
+ * for the use of five- and six-byte sequences to encode characters that
+ * are outside the range of the Unicode character set; those five- and
+ * six-byte sequences are illegal for the use of UTF-8 as a transformation
+ * of Unicode characters. ISO/IEC 10646 does not allow mapping of unpaired
+ * surrogates, nor U+FFFE and U+FFFF (but it does allow other noncharacters).
+ ***/
+ char len[2] = {(char)(trailingBytes+0x31), 0};
char byte[2] = {*srcPtr,0};
- ThrowXMLwithMemMgr3(UTFDataFormatException, XMLExcepts::UTF8_FormatError, pos, byte, len, getMemoryManager());
- }
- }
- if((*srcPtr & 0xC0) == 0x80)
- {
- tmpVal += *srcPtr++;
- }
- else
- {
- char len[2] = {(char)(trailingBytes+0x31), 0};
- char byte[2] = {*srcPtr,0};
- ThrowXMLwithMemMgr3(UTFDataFormatException, XMLExcepts::UTF8_FormatError, len, byte, len, getMemoryManager());
+
+ ThrowXMLwithMemMgr2(UTFDataFormatException
+ , XMLExcepts::UTF8_Exceede_BytesLimit
+ , byte
+ , len
+ , getMemoryManager());
+
+ break;
}
+
+
// since trailingBytes comes from an array, this logic is redundant
// default :
// ThrowXML(TranscodingException, XMLExcepts::Trans_BadSrcSeq);
@@ -371,7 +521,7 @@
// one to be zero.
//
*sizePtr++ = 0;
- *outPtr++ = XMLCh(tmpVal & 0x3FF) + 0xDC00;
+ *outPtr++ = XMLCh((tmpVal & 0x3FF) + 0xDC00);
}
}
@@ -438,12 +588,8 @@
encodedBytes = 2;
else if (curVal < 0x10000)
encodedBytes = 3;
- else if (curVal < 0x200000)
+ else if (curVal < 0x110000)
encodedBytes = 4;
- else if (curVal < 0x4000000)
- encodedBytes = 5;
- else if (curVal <= 0x7FFFFFFF)
- encodedBytes = 6;
else
{
// If the options say to throw, then throw
1.4 +28 -0 xml-xerces/c/src/xercesc/util/Transcoders/Uniconv390/XMLUTF8Transcoder390.hpp
Index: XMLUTF8Transcoder390.hpp
===================================================================
RCS file: /home/cvs/xml-xerces/c/src/xercesc/util/Transcoders/Uniconv390/XMLUTF8Transcoder390.hpp,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- XMLUTF8Transcoder390.hpp 22 Apr 2004 22:46:46 -0000 1.3
+++ XMLUTF8Transcoder390.hpp 30 Jun 2004 19:04:17 -0000 1.4
@@ -56,6 +56,9 @@
/*
* $Log$
+ * Revision 1.4 2004/06/30 19:04:17 peiyongz
+ * XML1.0-3rd Edition: UTF_8
+ *
* Revision 1.3 2004/04/22 22:46:46 neilg
* not all 390 processors support the new transcoding instructions; this patch makes Xerces work there as well. Thanks to Steve Dulin
*
@@ -69,6 +72,7 @@
#include <xercesc/util/XercesDefs.hpp>
#include <xercesc/util/TransService.hpp>
+#include <xercesc/util/UTFDataFormatException.hpp>
XERCES_CPP_NAMESPACE_BEGIN
@@ -126,12 +130,36 @@
private :
+
+ inline void checkTrailingBytes(
+ const XMLByte toCheck
+ , const unsigned int trailingBytes
+ , const unsigned int position
+ ) const;
+
+private :
// -----------------------------------------------------------------------
// Unimplemented constructors and operators
// -----------------------------------------------------------------------
XMLUTF8Transcoder390(const XMLUTF8Transcoder390&);
XMLUTF8Transcoder390& operator=(const XMLUTF8Transcoder390&);
};
+
+/**
+ * Validates one continuation (trailing) byte of a multi-byte UTF-8 sequence.
+ *
+ * A continuation byte must have the bit pattern 10xxxxxx. If toCheck does
+ * not, a UTFDataFormatException carrying XMLExcepts::UTF8_FormatError is
+ * raised, with the byte position, the offending byte and the sequence length
+ * passed as message arguments. Nothing happens when the byte is well formed.
+ *
+ * @param toCheck        the byte to validate
+ * @param trailingBytes  number of trailing bytes that follow the lead byte
+ * @param position       offset of toCheck within the sequence (the lead
+ *                       byte is at offset 0)
+ */
+inline
+void XMLUTF8Transcoder390::checkTrailingBytes(const XMLByte      toCheck
+                                            , const unsigned int trailingBytes
+                                            , const unsigned int position) const
+{
+    // Top two bits must be exactly "10" for a UTF-8 continuation byte.
+    if ((toCheck & 0xC0) != 0x80)
+    {
+        // Each message argument is handed over as a one-character,
+        // NUL-terminated string. Adding 0x31 (ASCII '1') to a zero-based
+        // count yields the digit of the corresponding one-based value.
+        // NOTE(review): 0x31 is not a digit in EBCDIC - confirm how these
+        // arguments are interpreted on the 390 host.
+        char len[2]  = {static_cast<char>(trailingBytes + 0x31), 0};
+        char pos[2]  = {static_cast<char>(position + 0x31), 0};
+        // Explicit conversion: XMLByte -> char is a narrowing conversion in a
+        // brace initializer wherever plain char is signed.
+        char byte[2] = {static_cast<char>(toCheck), 0};
+
+        ThrowXMLwithMemMgr3(UTFDataFormatException, XMLExcepts::UTF8_FormatError, pos, byte, len, getMemoryManager());
+    }
+}
XERCES_CPP_NAMESPACE_END
---------------------------------------------------------------------
To unsubscribe, e-mail: xerces-cvs-unsubscribe@xml.apache.org
For additional commands, e-mail: xerces-cvs-help@xml.apache.org