You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@xerces.apache.org by je...@locus.apache.org on 2000/11/09 22:51:36 UTC

cvs commit: xml-xerces/java/src/org/apache/xerces/readers UTF8Reader.java

jeffreyr    00/11/09 13:51:36

  Modified:    java/src/org/apache/xerces/readers UTF8Reader.java
  Log:
  Fixed a hang that occurred when we found a multibyte UTF-8 character outside the valid
  [#x10000-#x10FFFF] range. Even though we checked that the value was a valid UTF-8 sequence, we
  failed to check whether the value was within the allowed range.
  The loop was in the XMLDocumentScanner dispatch method, in the SCANNER_STATE_CONTENT case. After
  calling the entity reader's scanContent method we got a CONTENT_RESULT_INVALID_CHAR result,
  because it did a preliminary check to see if the value was outside the [#x10000-#x10FFFF] range.
  Later, in the CONTENT_RESULT_INVALID_CHAR case, we call the entity reader's lookingAtValidChar.
  This check returned true (valid), so the scanner state was restored and we went into an infinite
  loop within the dispatch method.
  The solution was to apply the same [#x10000-#x10FFFF] range test in lookingAtValidChar, so that
  it returns false and we bail out of the loop.
  
  The first check, in scanContent, should have been only for UTF-8 character validation, and the
  lookingAtValidChar check should then be just for the range of UTF-8 characters that XML allows.
  
  So we may want to revisit UTF8Reader.
  
  The testcase used to verify this problem is:
  
  xmltest/not-wf/sa/170.xml:
  
  <doc>abcd</doc>
  where a=0xf7
        b=0x80
        c=0x80
        d=0x80
  
  Revision  Changes    Path
  1.11      +12 -10    xml-xerces/java/src/org/apache/xerces/readers/UTF8Reader.java
  
  Index: UTF8Reader.java
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/readers/UTF8Reader.java,v
  retrieving revision 1.10
  retrieving revision 1.11
  diff -u -r1.10 -r1.11
  --- UTF8Reader.java	2000/09/19 15:55:53	1.10
  +++ UTF8Reader.java	2000/11/09 21:51:36	1.11
  @@ -88,7 +88,7 @@
    * the SUN java runtime compiler (JIT) and the code here has been
    * carefully "crafted" to avoid those problems.
    * 
  - * @version $Id: UTF8Reader.java,v 1.10 2000/09/19 15:55:53 jeffreyr Exp $
  + * @version $Id: UTF8Reader.java,v 1.11 2000/11/09 21:51:36 jeffreyr Exp $
    */
   final class UTF8Reader extends XMLEntityReader {
       //
  @@ -427,16 +427,18 @@
           boolean result = false;
   
           //if (( 0xf8 & b0 ) == 0xf0 ) {
  -            //if (!(b0 > 0xF4 || (b0 == 0xF4 && b1 >= 0x90))) { // [#x10000-#x10FFFF]
  -        if( ((b0&0xf8) == 0xf0) && ((b1&0xc0)==0x80) &&
  -            ((b2&0xc0) == 0x80) && ((b3&0xc0)==0x80)){
  -            if (skipPastChar) {
  +        //if (!(b0 > 0xF4 || (b0 == 0xF4 && b1 >= 0x90))) { // [#x10000-#x10FFFF]
  +        if ( ((b0&0xf8) == 0xf0) && ((b1&0xc0)==0x80) &&
  +             ((b2&0xc0) == 0x80) && ((b3&0xc0)==0x80)){
  +            if (!(b0 > 0xF4 || (b0 == 0xF4 && b1 >= 0x90))) { // [#x10000-#x10FFFF]
  +
  +                if (skipPastChar) {
                       fCharacterCounter++;
                       loadNextByte();
                       return true;
  +                }
  +                result = true;
               }
  -            result = true;
  -            //}
               fCurrentChunk = saveChunk;
               fCurrentIndex = saveIndex;
               fCurrentOffset = saveOffset;
  @@ -807,14 +809,14 @@
               int b2 = 0;
               if ((0xe0 & b0) == 0xc0) { // 110yyyyy 10xxxxxx
                   ch = ((0x1f & b0)<<6) + (0x3f & b1);
  -            } else if( (0xf0 & b0) == 0xe0 ) { 
  +            } else if ( (0xf0 & b0) == 0xe0 ) { 
                   b2 = loadNextByte();
                   ch = ((0x0f & b0)<<12) + ((0x3f & b1)<<6) + (0x3f & b2);
  -            } else if(( 0xf8 & b0 ) == 0xf0 ){
  +            } else if (( 0xf8 & b0 ) == 0xf0 ){
                   b2 = loadNextByte();
                   int b3 = loadNextByte(); // 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
                   ch = ((0x0f & b0)<<18) + ((0x3f & b1)<<12)
  -                         + ((0x3f & b2)<<6) + (0x3f & b3);
  +                     + ((0x3f & b2)<<6) + (0x3f & b3);
               }
           }
           loadNextByte();