You are viewing a plain text version of this content. The canonical link for it is here.
Posted to c-dev@xerces.apache.org by "kirby zhou (JIRA)" <xe...@xml.apache.org> on 2010/09/06 16:12:35 UTC

[jira] Issue Comment Edited: (XERCESC-1936) ICUTransService and IconvGNUTransService CAN NOT deal with huge file.

    [ https://issues.apache.org/jira/browse/XERCESC-1936?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12894980#action_12894980 ] 

kirby zhou edited comment on XERCESC-1936 at 9/6/10 10:12 AM:
--------------------------------------------------------------

The following two commands build the test files from raw GBK byte escapes, which makes them easier to use for debugging in a UTF-8 locale.

]# ( echo '<?xml version="1.0" encoding="GBK" ?>'; echo '<data>'; for ((i=0;i<2;++i)); do echo -en '\xd6\xd0\xce\xc4\xba\xba\xd7\xd6A'; done ; echo; echo '</data>' ) > ~/small.xml

]# ( echo '<?xml version="1.0" encoding="GBK" ?>'; echo '<data>'; for ((i=0;i<100000;++i)); do echo -en '\xd6\xd0\xce\xc4\xba\xba\xd7\xd6A'; done ; echo; echo '</data>' ) > ~/big.xml 


diff -x .svn -x CVS -ru --show-c-function xerces-c-3.1.1.bak/src/xercesc/util/Transcoders/IconvGNU/IconvGNUTransService.cpp xerces-c-3.1.1/src/xercesc/util/Transcoders/IconvGNU/IconvGNUTransService.cpp
--- xerces-c-3.1.1.bak/src/xercesc/util/Transcoders/IconvGNU/IconvGNUTransService.cpp   2010-01-20 16:45:02.000000000 +0800
+++ xerces-c-3.1.1/src/xercesc/util/Transcoders/IconvGNU/IconvGNUTransService.cpp       2010-08-04 02:07:06.000000000 +0800
@@ -1049,6 +1049,9 @@ XMLSize_t    IconvGNUTranscoder::transco
     for (size_t cnt = 0; cnt < maxChars && srcLen; cnt++) {
         size_t    rc = iconvFrom(startSrc, &srcLen, &orgTarget, uChSize());
         if (rc == (size_t)-1) {
+            if (errno == EINVAL) {
+                break;
+            }
             if (errno != E2BIG || prevSrcLen == srcLen) {
                 ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Trans_BadSrcSeq, getMemoryManager());
             }
diff -x .svn -x CVS -ru --show-c-function xerces-c-3.1.1.bak/src/xercesc/util/Transcoders/ICU/ICUTransService.cpp xerces-c-3.1.1/src/xercesc/util/Transcoders/ICU/ICUTransService.cpp
--- xerces-c-3.1.1.bak/src/xercesc/util/Transcoders/ICU/ICUTransService.cpp     2010-01-20 16:45:02.000000000 +0800
+++ xerces-c-3.1.1/src/xercesc/util/Transcoders/ICU/ICUTransService.cpp 2010-08-04 02:28:46.000000000 +0800
@@ -666,7 +666,7 @@ ICUTranscoder::transcodeTo( const   XMLC
     );
 
     // Rememember the status before we possibly overite the error code
-    const bool res = (err == U_ZERO_ERROR);
+    const bool res = (err == U_ZERO_ERROR || (err == U_BUFFER_OVERFLOW_ERROR && startSrc > srcPtr));
 
     // Put the old handler back
     err = U_ZERO_ERROR;


      was (Author: kirbyzhou):
    The following 2 lines are more suitable for UTF-8 locale users to debug.

]# ( echo '<?xml version="1.0" encoding="GBK" ?>'; echo '<data>'; for ((i=0;i<2;++i)); do echo -en '\xd6\xd0\xce\xc4\xba\xba\xd7\xd6A'; done ; echo; echo '</data>' ) >  /small.xml

]# ( echo '<?xml version="1.0" encoding="GBK" ?>'; echo '<data>'; for ((i=0;i<100000;++i)); do echo -en '\xd6\xd0\xce\xc4\xba\xba\xd7\xd6A'; done ; echo; echo '</data>' ) > ~/big.xml 


diff -x .svn -x CVS -ru --show-c-function xerces-c-3.1.1.bak/src/xercesc/util/Transcoders/IconvGNU/IconvGNUTransService.cpp xerces-c-3.1.1/src/xercesc/util/Transcoders/IconvGNU/IconvGNUTransService.cpp
--- xerces-c-3.1.1.bak/src/xercesc/util/Transcoders/IconvGNU/IconvGNUTransService.cpp   2010-01-20 16:45:02.000000000 +0800
+++ xerces-c-3.1.1/src/xercesc/util/Transcoders/IconvGNU/IconvGNUTransService.cpp       2010-08-04 02:07:06.000000000 +0800
@@ -1049,6 +1049,9 @@ XMLSize_t    IconvGNUTranscoder::transco
     for (size_t cnt = 0; cnt < maxChars && srcLen; cnt++) {
         size_t    rc = iconvFrom(startSrc, &srcLen, &orgTarget, uChSize());
         if (rc == (size_t)-1) {
+            if (errno == EINVAL) {
+                break;
+            }
             if (errno != E2BIG || prevSrcLen == srcLen) {
                 ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Trans_BadSrcSeq, getMemoryManager());
             }
diff -x .svn -x CVS -ru --show-c-function xerces-c-3.1.1.bak/src/xercesc/util/Transcoders/ICU/ICUTransService.cpp xerces-c-3.1.1/src/xercesc/util/Transcoders/ICU/ICUTransService.cpp
--- xerces-c-3.1.1.bak/src/xercesc/util/Transcoders/ICU/ICUTransService.cpp     2010-01-20 16:45:02.000000000 +0800
+++ xerces-c-3.1.1/src/xercesc/util/Transcoders/ICU/ICUTransService.cpp 2010-08-04 02:28:46.000000000 +0800
@@ -666,7 +666,7 @@ ICUTranscoder::transcodeTo( const   XMLC
     );
 
     // Rememember the status before we possibly overite the error code
-    const bool res = (err == U_ZERO_ERROR);
+    const bool res = (err == U_ZERO_ERROR || (err == U_BUFFER_OVERFLOW_ERROR && startSrc > srcPtr));
 
     // Put the old handler back
     err = U_ZERO_ERROR;
[


  
> ICUTransService and IconvGNUTransService CAN NOT deal with huge file.
> ---------------------------------------------------------------------
>
>                 Key: XERCESC-1936
>                 URL: https://issues.apache.org/jira/browse/XERCESC-1936
>             Project: Xerces-C++
>          Issue Type: Bug
>          Components: Utilities
>    Affects Versions: 2.8.0, 3.1.1
>         Environment: RHEL-5.5
> glibc-2.5-49.el5_5.2
> libicu-3.6-5.11.4
>            Reporter: kirby zhou
>
> If a huge file is passed to XMLReader, it calls the TransService multiple times, splitting the file content into several fragments.
> Unfortunately, a fragment may end with an incomplete multi-byte character.
> Neither ICUTransService nor IconvGNUTransService deals with this: ICUTransService does not handle U_TRUNCATED_CHAR_FOUND, and IconvGNUTransService does not handle EINVAL.
> Both 2.8.0 and 3.1.1 have the same bug.
> For example, make 2 XML like that:
> ]# ( echo '<?xml version="1.0" encoding="GBK" ?>'; echo '<data>'; for ((i=0;i<2;++i)); do echo -n '中文汉字A'; done ; echo; echo '</data>' ) > ~/small.xml
> ]# ( echo '<?xml version="1.0" encoding="GBK" ?>'; echo '<data>'; for ((i=0;i<100000;++i)); do echo -n '中文汉字A'; done ; echo; echo '</data>' ) > ~/big.xml
> # small.xml and big.xml have the same structure; they differ only in the number of repetitions.
> ]# samples/SAXPrint -x=gbk ~/small.xml 
> <?xml version="1.0" encoding="gbk"?>
> <data>
> 中文汉字A中文汉字A
> </data>
> # with icu
> ]# samples/SAXPrint -x=gbk ~/big.xml
> <?xml version="1.0" encoding="gbk"?>
> <data>
> Fatal Error at file /root/big.xml, line 3, char 16377
>   Message: char 0x6C49 is not representable in 'gbk' encoding
> # with iconvgnu
> ]# samples/SAXPrint -x=gbk ~/big.xml
> <?xml version="1.0" encoding="gbk"?>
> <data>
> Fatal Error at file /root/big.xml, line 3, char 16377
>   Message: invalid multi-byte sequence

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.


---------------------------------------------------------------------
To unsubscribe, e-mail: c-dev-unsubscribe@xerces.apache.org
For additional commands, e-mail: c-dev-help@xerces.apache.org