You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@xalan.apache.org by Scott Boag/CAM/Lotus <Sc...@lotus.com> on 2000/12/13 05:07:14 UTC
Need code review of writeAttrURI

I'm trying to get the URL encoding that the XSLT spec dictates to be as
correct as it can be.  The following code, which I will also go ahead and
check in, checks the UTF-16 value to see if it should be encoded (see
comments on the problems with this... basically, as far as I understand,
there's no perfect way to do this), and, if it needs encoding, encodes from
UTF-16 to UTF-8, including surrogate pairs.  If there are folks on this
list who have the applicable knowledge, I would much appreciate a code
review...

The accum function obviously just writes to the output buffer.

  /**
   * Tell whether a UTF-16 code unit is a high (leading) surrogate, i.e. the
   * first member of a surrogate pair.
   *
   * @param c the UTF-16 code unit to test.
   * @return true if <code>c</code> lies in the range 0xD800..0xDBFF.
   */
  static final boolean isUTF16Surrogate(char c)
  {
    // High surrogates occupy one contiguous range, so a plain range
    // comparison selects exactly the same values as masking the top 6 bits.
    final boolean atOrAboveFirst = c >= 0xD800;
    final boolean atOrBelowLast = c <= 0xDBFF;

    return atOrAboveFirst && atOrBelowLast;
  }

  /**
   * Tell whether a character is one of the ASCII decimal digits
   * <code>'0'</code> through <code>'9'</code>.
   *
   * @param c the character to test.
   * @return true if <code>c</code> is an ASCII decimal digit.
   */
   private boolean isASCIIDigit(char c)
   {
      // Distance from '0'; only the ten ASCII digits land in 0..9.
      final int offset = c - '0';

      return offset >= 0 && offset <= 9;
   }

  /**
   * Write the specified <var>string</var> after substituting non ASCII
characters,
   * with <CODE>%HH</CODE>, where HH is the hex of the byte value.
   *
   * @param   string      String to convert to XML format.
   * @param doURLEscaping True if we should try to encode as
   *                      per http://www.ietf.org/rfc/rfc2396.txt.
   * @see #backReference
   *
   * @throws org.xml.sax.SAXException if a bad surrogate pair is detected.
   */
  public void writeAttrURI(String string, boolean doURLEscaping)
          throws org.xml.sax.SAXException
  {
    // http://www.ietf.org/rfc/rfc2396.txt says:
    // A URI is always in an "escaped" form, since escaping or unescaping a
    // completed URI might change its semantics.  Normally, the only time
    // escape encodings can safely be made is when the URI is being created
    // from its component parts; each component may have its own set of
    // characters that are reserved, so only the mechanism responsible for
    // generating or interpreting that component can determine whether or
    // not escaping a character will change its semantics. Likewise, a URI
    // must be separated into its components before the escaped characters
    // within those components can be safely decoded.
    //
    // ...So we do our best to do limited escaping of the URL, without
    // causing damage.  If the URL is already properly escaped, in theory,
this
    // function should not change the string value.

    char[] stringArray = string.toCharArray();
    int len = stringArray.length;

    for (int i = 0; i < len; i++)
    {
      char ch = stringArray[i];

      if ((ch < 33) || (ch > 126))
      {
        if (doURLEscaping)
        {
          // Encode UTF16 to UTF8.
          // Reference is Unicode, A Primer, by Tony Graham.
          // Page 92.

          if(ch <= 0x7F)
          {
            accum('%');
            accum(Integer.toHexString(ch).toUpperCase());
          }
          else if(ch <= 0x7FF)
          {
            // Clear low 6 bits before rotate, put high 4 bits in low byte,
            // and set two high bits.
            int high = (ch >> 6) | 0xC0;
            int low = (ch & 0x3F) | 0x80; // First 6 bits, + high bit
            accum('%');
            accum(Integer.toHexString(high).toUpperCase());
            accum('%');
            accum(Integer.toHexString(low).toUpperCase());
          }
          else if( isUTF16Surrogate(ch) ) // high surrogate
          {
            // I'm sure this can be done in 3 instructions, but I choose
            // to try and do it exactly like it is done in the book, at
least
            // until we are sure this is totally clean.  I don't think
performance
            // is a big issue with this particular function, though I could
be
            // wrong.  Also, the stuff below clearly does more masking than
            // it needs to do.

            // Clear high 6 bits.
            int highSurrogate = ((int) ch) & 0x03FF;

            // Middle 4 bits (wwww) + 1
            // "Note that the value of wwww from the high surrogate bit
pattern
            // is incremented to make the uuuuu bit pattern in the scalar
value
            // so the surrogate pair don't address the BMP."
            int wwww = ((highSurrogate & 0x03C0) >> 6);
            int uuuuu = wwww+1;

            // next 4 bits
            int zzzz = (highSurrogate & 0x003C) >> 2;

            // low 2 bits
            int yyyyyy = ((highSurrogate & 0x0003) << 4) & 0x30;

            // Get low surrogate character.
            ch = stringArray[++i];

            // Clear high 6 bits.
            int lowSurrogate = ((int) ch) & 0x03FF;

            // put the middle 4 bits into the bottom of yyyyyy (byte 3)
            yyyyyy = yyyyyy | ((lowSurrogate & 0x03C0) >> 6);

            // bottom 6 bits.
            int xxxxxx = (lowSurrogate & 0x003F);

            int byte1 = 0xF0 | (uuuuu >> 2); // top 3 bits of uuuuu
            int byte2 = 0x80 | (((uuuuu & 0x03) << 4) & 0x30) | zzzz;
            int byte3 = 0x80 | yyyyyy;
            int byte4 = 0x80 | xxxxxx;

            accum('%');
            accum(Integer.toHexString(byte1).toUpperCase());
            accum('%');
            accum(Integer.toHexString(byte2).toUpperCase());
            accum('%');
            accum(Integer.toHexString(byte3).toUpperCase());
            accum('%');
            accum(Integer.toHexString(byte4).toUpperCase());
          }
          else
          {
            int high = (ch >> 12) | 0xE0; // top 4 bits
            int middle = ((ch & 0x0FC0) >> 6) | 0x80; // middle 6 bits
            int low = (ch & 0x3F) | 0x80; // First 6 bits, + high bit
            accum('%');
            accum(Integer.toHexString(high).toUpperCase());
            accum('%');
            accum(Integer.toHexString(middle).toUpperCase());
            accum('%');
            accum(Integer.toHexString(low).toUpperCase());
          }

        }
        else if (ch < m_maxCharacter)
        {
          accum(ch);
        }
        else
        {
          accum("&#");
          accum(Integer.toString(ch));
          accum(';');
        }
      }
      else if('%' == ch)
      {
        // If the character is a '%' number number, try to avoid
double-escaping.
        // There is a question if this is legal behavior.
        if(((i+2) < len) && isASCIIDigit(stringArray[i+1])
            && isASCIIDigit(stringArray[i+2]))
        {
         accum(ch);
        }
        else
        {
          if (doURLEscaping)
          {
           accum('%');
           accum(Integer.toHexString(ch).toUpperCase());
          }
          else
            accum(ch);
        }

      }
      // Since http://www.ietf.org/rfc/rfc2396.txt refers to the URI
grammar as
      // not allowing quotes in the URI proper syntax, nor in the fragment
      // identifier, we believe that it's OK to double escape quotes.
      else if (ch == '"')
      {
        // Mike Kay encodes this as &#34;, so he may know something I
don't?
        if (doURLEscaping)
          accum("%22");
        else
          accum("&quot;"); // we have to escape this, I guess.
      }
      else
      {
        accum(ch);
      }
    }

  }