You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@xalan.apache.org by mi...@apache.org on 2005/08/03 21:20:32 UTC
cvs commit: xml-xalan/java/src/org/apache/xml/serializer ToTextStream.java Encodings.properties ToHTMLStream.java ToStream.java EncodingInfo.java ToXMLStream.java Encodings.java

minchau     2005/08/03 12:20:32

  Modified:    java/src/org/apache/xml/serializer ToTextStream.java
                        Encodings.properties ToHTMLStream.java
                        ToStream.java EncodingInfo.java ToXMLStream.java
                        Encodings.java
  Log:
  Committing patch for XALANJ-2087
  This is a fix for correctly deciding whether or not a character is in an
  encoding, so that we write it either as is, or as an entity.
  
  The old code had a concept of a
  maximum character in the encoding, and assumed that all characters less
  than that Unicode value were in the encoding.  The new code uses the
  underlying Java library, but caches results for performance.
  
  Patch was reviewed/approved by Yash Talwar (see XALANJ-2087)
  
  Revision  Changes    Path
  1.22      +83 -150   xml-xalan/java/src/org/apache/xml/serializer/ToTextStream.java
  
  Index: ToTextStream.java
  ===================================================================
  RCS file: /home/cvs/xml-xalan/java/src/org/apache/xml/serializer/ToTextStream.java,v
  retrieving revision 1.21
  retrieving revision 1.22
  diff -u -r1.21 -r1.22
  --- ToTextStream.java	7 Apr 2005 04:29:03 -0000	1.21
  +++ ToTextStream.java	3 Aug 2005 19:20:31 -0000	1.22
  @@ -214,7 +214,7 @@
           }
           else {
               // In final output state we do process the characters!
  -            writeNormalizedChars(ch, start, length, false, m_lineSepUse);
  +            writeNormalizedChars(ch, start, length, m_lineSepUse);
           }
               
           if (m_tracer != null)
  @@ -243,7 +243,7 @@
   
       try
       {
  -      writeNormalizedChars(ch, start, length, false, m_lineSepUse);
  +      writeNormalizedChars(ch, start, length, m_lineSepUse);
       }
       catch(IOException ioe)
       {
  @@ -251,168 +251,101 @@
       }
     }
     
  -/**
  - * Normalize the characters, but don't escape.  Different from 
  - * SerializerToXML#writeNormalizedChars because it does not attempt to do 
  - * XML escaping at all.
  - *
  - * @param ch The characters from the XML document.
  - * @param start The start position in the array.
  - * @param length The number of characters to read from the array.
  - * @param isCData true if a CDATA block should be built around the characters.
  - * @param useLineSep true if the operating systems 
  - * end-of-line separator should be output rather than a new-line character.
  - * 
  - * @throws IOException
  - * @throws org.xml.sax.SAXException
  - */
  -void writeNormalizedChars(
  -    final char ch[],
  -    final int start,
  -    final int length,
  -    final boolean isCData,
  -    final boolean useLineSep)
  -    throws IOException, org.xml.sax.SAXException
  -{
  -    final java.io.Writer writer = m_writer;
  -    final int end = start + length;
  -
  -    /* copy a few "constants" before the loop for performance */
  -    final char S_LINEFEED = CharInfo.S_LINEFEED;
  -    final int M_MAXCHARACTER = this.m_maxCharacter;
  +    /**
  +     * Normalize the characters, but don't escape.  Different from 
  +     * SerializerToXML#writeNormalizedChars because it does not attempt to do 
  +     * XML escaping at all.
  +     *
  +     * @param ch The characters from the XML document.
  +     * @param start The start position in the array.
  +     * @param length The number of characters to read from the array.
  +     * @param useLineSep true if the operating systems 
  +     * end-of-line separator should be output rather than a new-line character.
  +     * 
  +     * @throws IOException
  +     * @throws org.xml.sax.SAXException
  +     */
  +    void writeNormalizedChars(
  +        final char ch[],
  +            final int start,
  +            final int length,
  +            final boolean useLineSep)
  +            throws IOException, org.xml.sax.SAXException 
  +    {
  +        final String encoding = getEncoding();
  +        final java.io.Writer writer = m_writer;
  +        final int end = start + length;
  +
  +        /* copy a few "constants" before the loop for performance */
  +        final char S_LINEFEED = CharInfo.S_LINEFEED;
   
  -    if (isCData)
  -    {
           // This for() loop always increments i by one at the end
           // of the loop.  Additional increments of i adjust for when
  -        // two input characters are processed.
  -        for (int i = start; i < end; i++)
  -        {
  +        // two input characters (a high/low UTF16 surrogate pair)
  +        // are processed.
  +        for (int i = start; i < end; i++) {
               final char c = ch[i];
   
  -            if (S_LINEFEED == c && useLineSep)
  -            {
  +            if (S_LINEFEED == c && useLineSep) {
                   writer.write(m_lineSep, 0, m_lineSepLen);
  -            }
  -            else if (c > M_MAXCHARACTER)
  -            {
  -                if (i != 0)
  -                    closeCDATA();
  -
  -                // This needs to go into a function...
  -                if (isUTF16Surrogate(c))
  -                {
  -                    writeUTF16Surrogate(c, ch, i, end);
  -                    i++; // two input characters processed
  -                }
  -                else
  -                {
  -                    writer.write(c);
  -                }
  -
  -                if ((i != 0) && (i < (end - 1)))
  -                {
  -                    writer.write(CDATA_DELIMITER_OPEN);
  -                    m_cdataTagOpen = true;
  -                }
  -            }
  -            else if (
  -                ((i < (end - 2))
  -                    && (']' == c)
  -                    && (']' == ch[i + 1])
  -                    && ('>' == ch[i + 2])))
  -            {
  -                writer.write(CDATA_CONTINUE);
  -                i += 2;
  -            }
  -            else
  -            {
  -                if (c <= M_MAXCHARACTER)
  -                {
  -                    writer.write(c);
  -                }
  +                // one input char processed
  +            } else if (m_encodingInfo.isInEncoding(c)) {
  +                writer.write(c);
  +                // one input char processed    
  +            } else if (Encodings.isHighUTF16Surrogate(c)) {
  +                final int codePoint = writeUTF16Surrogate(c, ch, i, end);
  +                if (codePoint != 0) {
  +                    // I think we can just emit the message,
  +                    // not crash and burn.
  +                    final String integralValue = Integer.toString(codePoint);
  +                    final String msg = Utils.messages.createMessage(
  +                        MsgKey.ER_ILLEGAL_CHARACTER,
  +                        new Object[] { integralValue, encoding });
  +                      
  +                    //Older behavior was to throw the message,
  +                    //but newer gentler behavior is to write a message to System.err
  +                    //throw new SAXException(msg);
  +                    System.err.println(msg);                            
   
  -                else if (isUTF16Surrogate(c))
  -                {
  -                    writeUTF16Surrogate(c, ch, i, end);
  -                    i++; // two input characters processed
  -                }
  -                else
  -                {
  -                    /* The character is greater than the allowed 
  -                     * maximum value and it is not part of a UTF-16
  -                     * pair that would be put out as a character reference.
  -                     */
  -                    String encoding = getEncoding();
  -                    if (encoding != null)
  -                    {
  -                        /* The output encoding is known, 
  -                         * so somthing is wrong.
  -                         */ 
  -                        String integralValue = Integer.toString(c);
  -                        throw new SAXException(Utils.messages.createMessage(
  -                            MsgKey.ER_ILLEGAL_CHARACTER,
  -                            new Object[]{ integralValue, encoding}));
  -                    }
  -                    else 
  -                    {
  -                        /* The output encoding is not known,
  -                         * so just write it out as-is.
  -                         */                        
  -                        writer.write(c);
  -                    }
                   }
  -            }
  -        }
  -    }
  -    else
  -    {
  -        // not in CDATA section
  -        for (int i = start; i < end; i++)
  -        {
  -            final char c = ch[i];
  -
  -            if (S_LINEFEED == c && useLineSep)
  -            {
  -                writer.write(m_lineSep, 0, m_lineSepLen);
  -            }
  -            else if (c <= M_MAXCHARACTER)
  -            {
  -                writer.write(c);
  -            }
  -            else if (isUTF16Surrogate(c))
  -            {
  -                writeUTF16Surrogate(c, ch, i, end);
  -                i++; // two input characters processed
  -            }
  -            else
  -            {
  -                /* The character is greater than the allowed 
  -                 * maximum value and it is not part of a UTF-16
  -                 * pair that would be put out as a character reference.
  -                 */
  -                String encoding = getEncoding();
  -                if (encoding != null) 
  -                {
  +                i++; // two input chars processed               
  +            } else {
  +                // Don't know what to do with this char, it is
  +                // not in the encoding and not a high char in
  +                // a surrogate pair, so write out as an entity ref
  +                if (encoding != null) {
                       /* The output encoding is known, 
                        * so somthing is wrong.
  -                     */ 
  -                    String integralValue = Integer.toString(c);
  -                    throw new SAXException(Utils.messages.createMessage(
  +                     */
  +
  +                    // not in the encoding, so write out a character reference
  +                    writer.write('&');
  +                    writer.write('#');
  +                    writer.write(Integer.toString(c));
  +                    writer.write(';');
  +
  +                    // I think we can just emit the message,
  +                    // not crash and burn.
  +                    final String integralValue = Integer.toString(c);
  +                    final String msg = Utils.messages.createMessage(
                           MsgKey.ER_ILLEGAL_CHARACTER,
  -                        new Object[]{ integralValue, encoding}));
  -                }
  -                else 
  -                {
  +                        new Object[] { integralValue, encoding });
  +                      
  +                    //Older behavior was to throw the message,
  +                    //but newer gentler behavior is to write a message to System.err
  +                    //throw new SAXException(msg);
  +                    System.err.println(msg); 
  +                } else {
                       /* The output encoding is not known,
                        * so just write it out as-is.
  -                     */                        
  +                     */
                       writer.write(c);
  -                }                
  +                }
  +
  +                // one input char was processed
               }
           }
       }
  -}
   
     /**
      * Receive notification of cdata.
  @@ -444,7 +377,7 @@
     {
       try
       {
  -        writeNormalizedChars(ch, start, length, false, m_lineSepUse);
  +        writeNormalizedChars(ch, start, length, m_lineSepUse);
           if (m_tracer != null)
               super.fireCDATAEvent(ch, start, length);              
       }
  @@ -486,7 +419,7 @@
   
       try
       {
  -      writeNormalizedChars(ch, start, length, false, m_lineSepUse);
  +      writeNormalizedChars(ch, start, length, m_lineSepUse);
       }
       catch(IOException ioe)
       {
  
  
  
  1.7       +13 -1     xml-xalan/java/src/org/apache/xml/serializer/Encodings.properties
  
  Index: Encodings.properties
  ===================================================================
  RCS file: /home/cvs/xml-xalan/java/src/org/apache/xml/serializer/Encodings.properties,v
  retrieving revision 1.6
  retrieving revision 1.7
  diff -u -r1.6 -r1.7
  --- Encodings.properties	8 Mar 2004 18:49:10 -0000	1.6
  +++ Encodings.properties	3 Aug 2005 19:20:31 -0000	1.7
  @@ -42,6 +42,18 @@
   ISO8859-8 ISO-8859-8 0x00FF
   ISO8859_9 ISO-8859-9 0x00FF
   ISO8859-9 ISO-8859-9 0x00FF
  +ISO8859_10 ISO-8859-10 0x00FF
  +ISO8859-10 ISO-8859-10 0x00FF
  +ISO8859_11 ISO-8859-11 0x00FF
  +ISO8859-11 ISO-8859-11 0x00FF
  +ISO8859_12 ISO-8859-12 0x00FF
  +ISO8859-12 ISO-8859-12 0x00FF
  +ISO8859_13 ISO-8859-13 0x00FF
  +ISO8859-13 ISO-8859-13 0x00FF
  +ISO8859_14 ISO-8859-14 0x00FF
  +ISO8859-14 ISO-8859-14 0x00FF
  +ISO8859_15 ISO-8859-15 0x00FF
  +ISO8859-15 ISO-8859-15 0x00FF
   # # ?
   8859_1 ISO-8859-1 0x00FF
   8859_2 ISO-8859-2 0x00FF
  
  
  
  1.43      +3 -3      xml-xalan/java/src/org/apache/xml/serializer/ToHTMLStream.java
  
  Index: ToHTMLStream.java
  ===================================================================
  RCS file: /home/cvs/xml-xalan/java/src/org/apache/xml/serializer/ToHTMLStream.java,v
  retrieving revision 1.42
  retrieving revision 1.43
  diff -u -r1.42 -r1.43
  --- ToHTMLStream.java	15 Jul 2005 16:11:37 -0000	1.42
  +++ ToHTMLStream.java	3 Aug 2005 19:20:31 -0000	1.43
  @@ -1173,7 +1173,7 @@
                           writer.write('%');
                           writer.write(makeHHString(low));
                       }
  -                    else if (isUTF16Surrogate(ch)) // high surrogate
  +                    else if (Encodings.isHighUTF16Surrogate(ch)) // high surrogate
                       {
                           // I'm sure this can be done in 3 instructions, but I choose 
                           // to try and do it exactly like it is done in the book, at least 
  @@ -1380,7 +1380,7 @@
                   }
                   else
                   {
  -                    if (isUTF16Surrogate(ch))
  +                    if (Encodings.isHighUTF16Surrogate(ch))
                       {
    
                               writeUTF16Surrogate(ch, chars, i, end);
  
  
  
  1.44      +97 -154   xml-xalan/java/src/org/apache/xml/serializer/ToStream.java
  
  Index: ToStream.java
  ===================================================================
  RCS file: /home/cvs/xml-xalan/java/src/org/apache/xml/serializer/ToStream.java,v
  retrieving revision 1.43
  retrieving revision 1.44
  diff -u -r1.43 -r1.44
  --- ToStream.java	1 Jun 2005 19:17:08 -0000	1.43
  +++ ToStream.java	3 Aug 2005 19:20:31 -0000	1.44
  @@ -54,22 +54,20 @@
       /** Stack to keep track of disabling output escaping. */
       protected BoolStack m_disableOutputEscapingStates = new BoolStack();
   
  -    /**
  -     * Boolean that tells if we already tried to get the converter.
  -     */
  -    boolean m_triedToGetConverter = false;
  -    /**
  -     * Method reference to the sun.io.CharToByteConverter#canConvert method 
  -     * for this encoding.  Invalid if m_charToByteConverter is null.
  -     */
  -    java.lang.reflect.Method m_canConvertMeth;
   
       /**
  -     * Opaque reference to the sun.io.CharToByteConverter for this 
  -     * encoding.
  +     * The encoding information associated with this serializer.
  +     * Although initially there is no encoding,
  +     * there is a dummy EncodingInfo object that will say
  +     * that every character is in the encoding. This is useful
  +     * for a serializer that is in temporary output state and has
  +     * no associated encoding. A serializer in final output state
  +     * will have an encoding, and will worry about whether 
  +     * single chars or surrogate pairs of high/low chars form
  +     * characters in the output encoding. 
        */
  -    Object m_charToByteConverter = null;
  -
  +    EncodingInfo m_encodingInfo = new EncodingInfo(null,null);
  +    
       /**
        * Stack to keep track of whether or not we need to
        * preserve whitespace.
  @@ -101,11 +99,6 @@
        */
       protected boolean m_isprevtext = false;
   
  -    /**
  -     * The maximum character size before we have to resort
  -     * to escaping.
  -     */
  -    protected int m_maxCharacter = Encodings.getLastPrintable();
   
       /**
        * The system line separator for writing out line breaks.
  @@ -224,18 +217,6 @@
       }
   
       /**
  -     * Return true if the character is the high member of a surrogate pair.
  -     *
  -     * NEEDSDOC @param c
  -     *
  -     * NEEDSDOC ($objectName$) @return
  -     */
  -    static final boolean isUTF16Surrogate(char c)
  -    {
  -        return (c & 0xFC00) == 0xD800;
  -    }
  -
  -    /**
        * Taken from XSLTC 
        */
       private boolean m_escaping = true;
  @@ -515,7 +496,6 @@
           }
   
           m_isUTF8 = encoding.equals(Encodings.DEFAULT_MIME_ENCODING);
  -        m_maxCharacter = Encodings.getLastPrintable(encoding);
   
           // Access this only from the Hashtable level... we don't want to 
           // get default properties.
  @@ -629,8 +609,6 @@
                   osw = Encodings.getWriter(output, encoding);
               }
   
  -            m_maxCharacter = Encodings.getLastPrintable(encoding);
  -
               init(osw, format, defaultProperties, true);
           }
   
  @@ -891,140 +869,102 @@
        */
       protected boolean escapingNotNeeded(char ch)
       {
  +        final boolean ret;
           if (ch < 127)
           {
  +            // This is the old/fast code here, but is this 
  +            // correct for all encodings?
               if (ch >= 0x20 || (0x0A == ch || 0x0D == ch || 0x09 == ch))
  -                return true;
  +                ret= true;
               else
  -                return false;
  -        }
  -
  -        if (null == m_charToByteConverter && false == m_triedToGetConverter)
  -        {
  -            m_triedToGetConverter = true;
  -            try
  -            {
  -                m_charToByteConverter =
  -                    Encodings.getCharToByteConverter(getEncoding());
  -                if (null != m_charToByteConverter)
  -                {
  -                    Class argsTypes[] = new Class[1];
  -                    argsTypes[0] = Character.TYPE;
  -                    Class convClass = m_charToByteConverter.getClass();
  -                    m_canConvertMeth =
  -                        convClass.getMethod("canConvert", argsTypes);
  -                }
  -            }
  -            catch (Exception e)
  -            {
  -                // This is just an assert: no action at the moment.
  -                System.err.println("Warning: " + e.getMessage());
  -            }
  +                ret = false;
           }
  -        if (null != m_charToByteConverter)
  -        {
  -            try
  -            {
  -                Object args[] = new Object[1];
  -                args[0] = new Character(ch);
  -                Boolean bool =
  -                    (Boolean) m_canConvertMeth.invoke(
  -                        m_charToByteConverter,
  -                        args);
  -                return bool.booleanValue()
  -                    ? !Character.isISOControl(ch)
  -                    : false;
  -            }
  -            catch (java.lang.reflect.InvocationTargetException ite)
  -            {
  -                // This is just an assert: no action at the moment.
  -                System.err.println(
  -                    "Warning: InvocationTargetException in canConvert!");
  -            }
  -            catch (java.lang.IllegalAccessException iae)
  -            {
  -                // This is just an assert: no action at the moment.
  -                System.err.println(
  -                    "Warning: IllegalAccessException in canConvert!");
  -            }
  +        else {            
  +            ret = m_encodingInfo.isInEncoding(ch);
           }
  -        // fallback!
  -        return (ch <= m_maxCharacter);
  +        return ret;
       }
   
       /**
        * Once a surrogate has been detected, write out the pair of
  -     * characters as a single character reference.
  +     * characters if it is in the encoding, or if there is no
  +     * encoding, otherwise write out an entity reference
  +     * of the value of the unicode code point of the character
  +     * represented by the high/low surrogate pair.
  +     * <p>
  +     * An exception is thrown if there is no low surrogate in the pair,
  +     * because the array ends unexpectely, or if the low char is there
  +     * but its value is such that it is not a low surrogate.
        *
  -     * @param c the first part of the surrogate.
  +     * @param c the first (high) part of the surrogate, which
  +     * must be confirmed before calling this method.
        * @param ch Character array.
        * @param i position Where the surrogate was detected.
        * @param end The end index of the significant characters.
  +     * @return 0 if the pair of characters was written out as-is,
  +     * the unicode code point of the character represented by
  +     * the surrogate pair if an entity reference with that value
  +     * was written out. 
  +     * 
        * @throws IOException
        * @throws org.xml.sax.SAXException if invalid UTF-16 surrogate detected.
        */
  -    protected void writeUTF16Surrogate(char c, char ch[], int i, int end)
  +    protected int writeUTF16Surrogate(char c, char ch[], int i, int end)
           throws IOException
       {
  -
  -        // UTF-16 surrogate
  -        int surrogateValue = getURF16SurrogateValue(c, ch, i, end);
  -
  -        final java.io.Writer writer = m_writer;
  -        writer.write('&');
  -        writer.write('#');
  -
  -        // writer.write('x');
  -        writer.write(Integer.toString(surrogateValue));
  -        writer.write(';');
  -    }
  -
  -    /**
  -     * Once a surrogate has been detected, get the pair as a single integer
  -     * value.
  -     *
  -     * @param c the first part of the surrogate.
  -     * @param ch Character array.
  -     * @param i position Where the surrogate was detected.
  -     * @param end The end index of the significant characters.
  -     * @return the integer value of the UTF-16 surrogate.
  -     * @throws org.xml.sax.SAXException if invalid UTF-16 surrogate detected.
  -     */
  -    int getURF16SurrogateValue(char c, char ch[], int i, int end)
  -        throws IOException
  -    {
  -
  -        int next;
  -
  +        int codePoint = 0;
           if (i + 1 >= end)
           {
               throw new IOException(
                   Utils.messages.createMessage(
                       MsgKey.ER_INVALID_UTF16_SURROGATE,
                       new Object[] { Integer.toHexString((int) c)}));
  -            //"Invalid UTF-16 surrogate detected: "
  -
  -            //+Integer.toHexString((int)c)+ " ?");
           }
  -        else
  -        {
  -            next = ch[++i];
  -
  -            if (!(0xdc00 <= next && next < 0xe000))
  -                throw new IOException(
  -                    Utils.messages.createMessage(
  -                        MsgKey.ER_INVALID_UTF16_SURROGATE,
  -                        new Object[] {
  -                            Integer.toHexString((int) c)
  -                                + " "
  -                                + Integer.toHexString(next)}));
  -            //"Invalid UTF-16 surrogate detected: "
  -
  -            //+Integer.toHexString((int)c)+" "+Integer.toHexString(next));
  -            next = ((c - 0xd800) << 10) + next - 0xdc00 + 0x00010000;
  +        
  +        final char high = c;
  +        final char low = ch[i+1];
  +        if (!Encodings.isLowUTF16Surrogate(low)) {
  +            throw new IOException(
  +                Utils.messages.createMessage(
  +                    MsgKey.ER_INVALID_UTF16_SURROGATE,
  +                    new Object[] {
  +                        Integer.toHexString((int) c)
  +                            + " "
  +                            + Integer.toHexString(low)}));
           }
   
  -        return next;
  +        final java.io.Writer writer = m_writer;
  +                
  +        // If we make it to here we have a valid high, low surrogate pair
  +        if (m_encodingInfo.isInEncoding(c,low)) {
  +            // If the character formed by the surrogate pair
  +            // is in the encoding, so just write it out
  +            writer.write(ch,i,2);
  +        }
  +        else {
  +            // Don't know what to do with this char, it is
  +            // not in the encoding and not a high char in
  +            // a surrogate pair, so write out as an entity ref
  +            final String encoding = getEncoding();
  +            if (encoding != null) {
  +                /* The output encoding is known, 
  +                 * so somthing is wrong.
  +                  */
  +                codePoint = Encodings.toCodePoint(high, low);
  +                // not in the encoding, so write out a character reference
  +                writer.write('&');
  +                writer.write('#');
  +                writer.write(Integer.toString(codePoint));
  +                writer.write(';');
  +            } else {
  +                /* The output encoding is not known,
  +                 * so just write it out as-is.
  +                 */
  +                writer.write(ch, i, 2);
  +            }
  +        }
  +        // non-zero only if character reference was written out.
  +        return codePoint;
       }
   
       /**
  @@ -1119,7 +1059,7 @@
                       closeCDATA();
   
                   // This needs to go into a function... 
  -                if (isUTF16Surrogate(c))
  +                if (Encodings.isHighUTF16Surrogate(c))
                   {
                       writeUTF16Surrogate(c, ch, i, end);
                       i++ ; // process two input characters
  @@ -1165,7 +1105,7 @@
                   }
   
                   // This needs to go into a function... 
  -                else if (isUTF16Surrogate(c))
  +                else if (Encodings.isHighUTF16Surrogate(c))
                   {
                       if (m_cdataTagOpen)
                           closeCDATA();
  @@ -1664,11 +1604,13 @@
   
           if (i == pos)
           {
  -            if (0xd800 <= ch && ch < 0xdc00)
  +            if (Encodings.isHighUTF16Surrogate(ch))
               {
   
  -                // UTF-16 surrogate
  -                int next;
  +                // Should be the UTF-16 low surrogate of the hig/low pair.
  +                char next;
  +                // Unicode code point formed from the high/low pair.
  +                int codePoint = 0;
   
                   if (i + 1 >= len)
                   {
  @@ -1684,7 +1626,7 @@
                   {
                       next = chars[++i];
   
  -                    if (!(0xdc00 <= next && next < 0xe000))
  +                    if (!(Encodings.isLowUTF16Surrogate(next)))
                           throw new IOException(
                               Utils.messages.createMessage(
                                   MsgKey
  @@ -1696,11 +1638,11 @@
                       //"Invalid UTF-16 surrogate detected: "
   
                       //+Integer.toHexString(ch)+" "+Integer.toHexString(next));
  -                    next = ((ch - 0xd800) << 10) + next - 0xdc00 + 0x00010000;
  +                    codePoint = Encodings.toCodePoint(ch,next);
                   }
   
                   writer.write("&#");
  -                writer.write(Integer.toString(next));
  +                writer.write(Integer.toString(codePoint));
                   writer.write(';');
                   pos += 2; // count the two characters that went into writing out this entity
               }
  @@ -3037,7 +2979,6 @@
        */
       private void resetToStream()
       {
  -         this.m_canConvertMeth = null;
            this.m_cdataStartCalled = false;
            /* The stream is being reset. It is one of
             * ToXMLStream, ToHTMLStream ... and this type can't be changed
  @@ -3046,7 +2987,7 @@
             * 
             */
            // this.m_charInfo = null; // don't set to null 
  -         this.m_charToByteConverter = null;
  +
            this.m_disableOutputEscapingStates.clear();
            
            this.m_escaping = true;
  @@ -3057,12 +2998,10 @@
            this.m_ispreserve = false;
            this.m_isprevtext = false;
            this.m_isUTF8 = false; //  ?? used anywhere ??
  -         this.m_maxCharacter = Encodings.getLastPrintable();
            this.m_preserves.clear();
            this.m_shouldFlush = true;
            this.m_spaceBeforeClose = false;
            this.m_startNewLine = false;
  -         this.m_triedToGetConverter = false;
            this.m_lineSepUse = true;
            // DON'T SET THE WRITER TO NULL, IT MAY BE REUSED !!
            // this.m_writer = null;  
  @@ -3076,8 +3015,12 @@
         */
        public void setEncoding(String encoding)
        {
  -         super.setEncoding(encoding);         
  -         m_maxCharacter = Encodings.getLastPrintable(encoding);
  +         String old = getEncoding();
  +         super.setEncoding(encoding); 
  +         if (old == null || !old.equals(encoding)) {        
  +            // If we have changed the setting of the 
  +            m_encodingInfo = Encodings.getEncodingInfo(encoding);
  +         }
            return;
        }
        
  
  
  
  1.5       +456 -14   xml-xalan/java/src/org/apache/xml/serializer/EncodingInfo.java
  
  Index: EncodingInfo.java
  ===================================================================
  RCS file: /home/cvs/xml-xalan/java/src/org/apache/xml/serializer/EncodingInfo.java,v
  retrieving revision 1.4
  retrieving revision 1.5
  diff -u -r1.4 -r1.5
  --- EncodingInfo.java	14 Oct 2004 21:45:05 -0000	1.4
  +++ EncodingInfo.java	3 Aug 2005 19:20:31 -0000	1.5
  @@ -17,14 +17,45 @@
    * $Id$
    */
   package org.apache.xml.serializer;
  +
  +import java.io.UnsupportedEncodingException;
  +
   /**
    * Holds information about a given encoding, which is the Java name for the
  - * encoding, the equivalent ISO name, and the integer value of the last pritable
  - * character in the encoding.
  + * encoding, the equivalent ISO name.
  + * <p>
  + * An object of this type has two useful methods
  + * <pre>
  + * isInEncoding(char ch);
  + * </pre>
  + * which can be called if the character is not the high one in
  + * a surrogate pair and:
  + * <pre>
  + * isInEncoding(char high, char low);
  + * </pre>
  + * which can be called if the two characters from a high/low surrogate pair.
  + * <p>
  + * An EncodingInfo object is a node in a binary search tree. Such a node
  + * will answer if a character is in the encoding, and do so for a given
  + * range of unicode values (<code>m_first</code> to
  + * <code>m_last</code>). It will handle a certain range of values
  + * explicitly (<code>m_explFirst</code> to <code>m_explLast</code>).
  + * If the unicode point is before that explicit range, that is, it
  + * is in the range <code>m_first <= value < m_explFirst</code>, then it will delegate to another such object, the root
  + * of a subtree, <code>m_before</code>.  Likewise for values in the range
  + * <code>m_explLast < value <= m_last</code>, but delegating to <code>m_after</code>.
  + * <p>
  + * Actually figuring out if a code point is in the encoding is expensive. So the
  + * purpose of this tree is to cache such determinations, and not to build the
  + * entire tree of information at the start, but only build up as much of the 
  + * tree as is used during the transformation.
  + * <p>
  + * This Class is not a public API, and should only be used internally within
  + * the serializer.
    * 
    * @xsl.usage internal
    */
  -final class EncodingInfo extends Object
  +public final class EncodingInfo extends Object
   {
   
       /**
  @@ -36,25 +67,436 @@
        * The name used by the Java convertor.
        */
       final String javaName;
  -
  +    
  +    /**
  +     * A helper object that we can ask if a
  +     * single char, or a surrogate UTF-16 pair
  +     * of chars that form a single character,
  +     * is in this encoding.
  +     */
  +    private InEncoding m_encoding;
  +    
       /**
  -     * The last printable character.
  +     * This is not a public API. It returns true if the
  +     * char in question is in the encoding.
  +     * @param ch the char in question.
  +     * @xsl.usage internal
        */
  -    final int lastPrintable;
  +    public boolean isInEncoding(char ch) {
  +        if (m_encoding == null) {
  +            m_encoding = new EncodingImpl();
  +            
  +            // One could put alternate logic in here to
  +            // instantiate another object that implements the
  +            // InEncoding interface. For example if the JRE is 1.4 or up
  +            // we could have an object that uses JRE 1.4 methods
  +        }
  +        return m_encoding.isInEncoding(ch); 
  +    }
  +    
  +    /**
  +     * This is not a public API. It returns true if the
  +     * character formed by the high/low pair is in the encoding.
  +     * @param high a char that is the high char of a high/low surrogate pair.
  +     * @param low a char that is the low char of a high/low surrogate pair.
  +     * @xsl.usage internal
  +     */
  +    public boolean isInEncoding(char high, char low) {
  +        if (m_encoding == null) {
  +            m_encoding = new EncodingImpl();
  +            
  +            // One could put alternate logic in here to
  +            // instantiate another object that implements the
  +            // InEncoding interface. For example if the JRE is 1.4 or up
  +            // we could have an object that uses JRE 1.4 methods
  +        }
  +        return m_encoding.isInEncoding(high, low); 
  +    }
   
       /**
  -     * Create an EncodingInfo object based on the name, java name, and the
  -     * max character size.
  +     * Create an EncodingInfo object based on the ISO name and Java name.
  +     * If both parameters are null any character will be considered to
  +     * be in the encoding. This is useful for when the serializer is in
  +     * temporary output state, and has no associated encoding.
        *
  -     * @param name non-null reference to the ISO name.
  -     * @param javaName non-null reference to the Java encoding name.
  -     * @param lastPrintable The maximum character that can be written.
  +     * @param name reference to the ISO name.
  +     * @param javaName reference to the Java encoding name.
        */
  -    public EncodingInfo(String name, String javaName, int lastPrintable)
  +    public EncodingInfo(String name, String javaName)
       {
   
           this.name = name;
           this.javaName = javaName;
  -        this.lastPrintable = lastPrintable;
       }
  +    
  +    
  +    
  +    /**
  +     * A simple interface to isolate the implementation.
  +     * We could also use some new JRE 1.4 methods in another implementation
  +     * provided we use reflection with them.
  +     * <p>
  +     * This interface is not a public API,
  +     * and should only be used internally within the serializer. 
  +     * @xsl.usage internal
  +     */
  +    private interface InEncoding {
  +        /**
  +         * Returns true if the char is in the encoding
  +         */
  +        public boolean isInEncoding(char ch);
  +        /**
  +         * Returns true if the high/low surrogate pair forms
  +         * a character that is in the encoding.
  +         */
  +        public boolean isInEncoding(char high, char low);
  +    }
  +
  +    /**
  +     * This class implements the InEncoding interface.
  +     */
  +    private class EncodingImpl implements InEncoding {
  +        
  +
  +
  +        public boolean isInEncoding(char ch1) {
  +            final boolean ret;
  +            int codePoint = Encodings.toCodePoint(ch1);
  +            if (codePoint < m_explFirst) {
  +                // The unicode value is before the range
  +                // that we explicitly manage, so we delegate the answer.
  +                
  +                // If we don't have an m_before object to delegate to, make one.
  +                if (m_before == null)
  +                    m_before =
  +                        new EncodingImpl(
  +                            m_encoding,
  +                            m_first,
  +                            m_explFirst - 1,
  +                            codePoint);
  +                ret = m_before.isInEncoding(ch1);
  +            } else if (m_explLast < codePoint) {
  +                // The unicode value is after the range
  +                // that we explicitly manage, so we delegate the answer.
  +                
  +                // If we don't have an m_after object to delegate to, make one.
  +                if (m_after == null)
  +                    m_after =
  +                        new EncodingImpl(
  +                            m_encoding,
  +                            m_explLast + 1,
  +                            m_last,
  +                            codePoint);
  +                ret = m_after.isInEncoding(ch1);
  +            } else {
  +                // The unicode value is in the range we explicitly handle
  +                final int idx = codePoint - m_explFirst;
  +                
  +                // If we already know the answer, just return it.
  +                if (m_alreadyKnown[idx])
  +                    ret = m_isInEncoding[idx];
  +                else {
  +                    // We don't know the answer, so find out,
  +                    // which may be expensive, then cache the answer 
  +                    ret = inEncoding(ch1, m_encoding);
  +                    m_alreadyKnown[idx] = true;
  +                    m_isInEncoding[idx] = ret;
  +                }
  +            }
  +            return ret;
  +        }
  +
  +        public boolean isInEncoding(char high, char low) {
  +            final boolean ret;
  +            int codePoint = Encodings.toCodePoint(high,low);
  +            if (codePoint < m_explFirst) {
  +                // The unicode value is before the range
  +                // that we explicitly manage, so we delegate the answer.
  +                
  +                // If we don't have an m_before object to delegate to, make one.
  +                if (m_before == null)
  +                    m_before =
  +                        new EncodingImpl(
  +                            m_encoding,
  +                            m_first,
  +                            m_explFirst - 1,
  +                            codePoint);
  +                ret = m_before.isInEncoding(high,low);
  +            } else if (m_explLast < codePoint) {
  +                // The unicode value is after the range
  +                // that we explicitly manage, so we delegate the answer.
  +                
  +                // If we don't have an m_after object to delegate to, make one.
  +                if (m_after == null)
  +                    m_after =
  +                        new EncodingImpl(
  +                            m_encoding,
  +                            m_explLast + 1,
  +                            m_last,
  +                            codePoint);
  +                ret = m_after.isInEncoding(high,low);
  +            } else {
  +                // The unicode value is in the range we explicitly handle
  +                final int idx = codePoint - m_explFirst;
  +                
  +                // If we already know the answer, just return it.
  +                if (m_alreadyKnown[idx])
  +                    ret = m_isInEncoding[idx];
  +                else {
  +                    // We don't know the answer, so find out,
  +                    // which may be expensive, then cache the answer 
  +                    ret = inEncoding(high, low, m_encoding);
  +                    m_alreadyKnown[idx] = true;
  +                    m_isInEncoding[idx] = ret;
  +                }
  +            }
  +            return ret;
  +        }
  +
  +        /**
  +         * The encoding.
  +         */
  +        final private String m_encoding;
  +        /**
  +         * m_first through m_last is the range of unicode
  +         * values that this object will return an answer on.
  +         * It may delegate to a similar object with a different
  +         * range
  +         */
  +        final private int m_first;
  +        
  +        /**
  +         * m_explFirst through m_explLast is the range of unicode
  +         * value that this object handles explicitly and does not
  +         * delegate to a similar object.
  +         */
  +        final private int m_explFirst;
  +        final private int m_explLast;
  +        final private int m_last;
  +
  +        /**
  +         * The object, of the same type as this one,
  +         * that handles unicode values in a range before
  +         * the range explicitly handled by this object, and
  +         * to which this object may delegate.
  +         */
  +        private InEncoding m_before;
  +        /**
  +         * The object, of the same type as this one,
  +         * that handles unicode values in a range after
  +         * the range explicitly handled by this object, and
  +         * to which this object may delegate.
  +         */
  +        private InEncoding m_after;
  +        
  +        /**
  +         * The number of unicode values explicitly handled
  +         * by a single EncodingInfo object. This value is 
  +         * tuneable, but is set to 128 because that covers the
  +         * entire low range of ASCII type chars within a single
  +         * object.
  +         */
  +        private static final int RANGE = 128;
  +
  +        /**
  +         * A flag to record if we already know the answer
  +         * for the given unicode value.
  +         */
  +        final private boolean m_alreadyKnown[] = new boolean[RANGE];
  +        /**
  +         * A table holding the answer on whether the given unicode
  +         * value is in the encoding.
  +         */
  +        final private boolean m_isInEncoding[] = new boolean[RANGE];
  +        
  +        private EncodingImpl() {
  +            // This object will answer whether any unicode value
  +            // is in the encoding, it handles values 0 through Integer.MAX_VALUE
  +            this(javaName, 0, Integer.MAX_VALUE, (char) 0);
  +        }
  +
  +        private EncodingImpl(String encoding, int first, int last, int codePoint) {
  +            // Set the range of unicode values that this object manages
  +            // either explicitly or implicitly.
  +            m_first = first;
  +            m_last = last;  
  +                      
  +            // Set the range of unicode values that this object 
  +            // explicitly manages
  +            m_explFirst = codePoint;
  +            m_explLast = codePoint + (RANGE-1);  
  +            
  +            m_encoding = encoding;
  +            
  +            if (javaName != null)
  +            {
  +                // Some optimization.
  +                if (0 <= m_explFirst && m_explFirst <= 127) {
  +                    // This particular EncodingImpl explicitly handles
  +                    // characters in the low range.
  +                    if ("UTF8".equals(javaName)
  +                        || "UTF-16".equals(javaName)
  +                        || "ASCII".equals(javaName)
  +                        || "US-ASCII".equals(javaName)
  +                        || "Unicode".equals(javaName)
  +                        || "UNICODE".equals(javaName)
  +                        || javaName.startsWith("ISO8859")) {
  +                        
  +                        // Not only does this EncodingImpl object explicitly
  +                        // handle characters in the low range, it is
  +                        // also one that we know something about, without
  +                        // needing to call inEncoding(char ch, String encoding)
  +                        // for this low range
  +                        //
  +                        // By initializing the table ahead of time
  +                        // for these low values, we prevent the expensive
  +                        // inEncoding(char ch, String encoding)
  +                        // from being called, at least for these common
  +                        // encodings.
  +                        for (int unicode = 1; unicode < 127; unicode++) {
  +                            final int idx = unicode - m_explFirst;
  +                            if (0 <= idx && idx < RANGE) {
  +                                m_alreadyKnown[idx] = true;
  +                                m_isInEncoding[idx] = true;
  +                            }
  +                        }
  +                    }
  +                }
  +
  +                /* A little bit more than optimization.
  +                 * 
  +                 * We will say that any character is in the encoding if
  +                 * we don't have an encoding. This is meaningful when the
  +                 * serializer is in temporary output state, where we are not
  +                 * writing to the final output tree; only the final output
  +                 * tree needs to worry about the output encoding.
  +                 * NOTE(review): this test sits inside the enclosing
  +                 * (javaName != null) branch, so it can never be true here.
  +                 */
  +                if (javaName == null) {
  +                    for (int idx = 0; idx < m_alreadyKnown.length; idx++) {
  +                        m_alreadyKnown[idx] = true;
  +                        m_isInEncoding[idx] = true;
  +                    }
  +                }
  +            }
  +        }
  +    }
  +
  +    /**
  +     * This is the heart of the code that determines if a given character
  +     * is in the given encoding. This method is probably expensive,
  +     * and the answer should be cached.
  +     * <p>
  +     * This method is not a public API,
  +     * and should only be used internally within the serializer.
  +     * @param ch the char in question, that is not a high char of
  +     * a high/low surrogate pair.
  +     * @param encoding the Java name of the encoding.
  +     * 
  +     * @xsl.usage internal
  +     * 
  +     */
  +    private static boolean inEncoding(char ch, String encoding) {
  +        boolean isInEncoding;
  +        try {
  +            char cArray[] = new char[1];
  +            cArray[0] = ch;
  +            // Construct a String from the char 
  +            String s = new String(cArray);
  +            // Encode the String into a sequence of bytes 
  +            // using the given, named charset. 
  +            byte[] bArray = s.getBytes(encoding);
  +            isInEncoding = inEncoding(ch, bArray);
  +
  +        } catch (Exception e) {
  +            isInEncoding = false;
  +        }
  +        return isInEncoding;
  +    }
  +    
  +    /**
  +     * This is the heart of the code that determines if a given high/low
  +     * surrogate pair forms a character that is in the given encoding.
  +     * This method is probably expensive, and the answer should be cached. 
  +     * <p>
  +     * This method is not a public API,
  +     * and should only be used internally within the serializer.
  +     * @param high the high char of
  +     * a high/low surrogate pair.
  +     * @param low the low char of a high/low surrogate pair.
  +     * @param encoding the Java name of the encoding.
  +     * 
  +     * @xsl.usage internal
  +     * 
  +     */ 
  +    private static boolean inEncoding(char high, char low, String encoding) {
  +        boolean isInEncoding;
  +        try {
  +            char cArray[] = new char[2];
  +            cArray[0] = high;
  +            cArray[1] = low;
  +            // Construct a String from the char 
  +            String s = new String(cArray);
  +            // Encode the String into a sequence of bytes 
  +            // using the given, named charset. 
  +            byte[] bArray = s.getBytes(encoding);
  +            isInEncoding = inEncoding(high,bArray);
  +        } catch (Exception e) {
  +            isInEncoding = false;
  +        }
  +        
  +        return isInEncoding;
  +    } 
  +    
  +    /**
  +     * This method is the core of determining if a character
  +     * is in the encoding. The method is not foolproof, because
  +     * s.getBytes(encoding) has specified behavior only if the
  +     * characters are in the specified encoding. However this
  +     * method tries its best.
  +     * @param ch the char that was converted using getBytes, or
  +     * the first char of a high/low pair that was converted.
  +     * @param data the bytes written out by the call to s.getBytes(encoding);
  +     * @return true if the character is in the encoding.
  +     */
  +    private static boolean inEncoding(char ch, byte[] data) {
  +        final boolean isInEncoding;
  +        // If the string written out as data is not in the encoding,
  +        // the output is not specified according to the documentation
  +        // on the String.getBytes(encoding) method,
  +        // but we do our best here.        
  +        if (data==null || data.length == 0) {
  +            isInEncoding = false;
  +        }
  +        else {
  +            if (data[0] == 0)
  +                isInEncoding = false;
  +            else if (data[0] == '?' && ch != '?')
  +                isInEncoding = false;
  +            /*
  +             * else if (isJapanese) {
  +             *   // isJapanese is really 
  +             *   //   (    "EUC-JP".equals(javaName) 
  +             *   //    ||  "EUC_JP".equals(javaName)
  +             *  //     ||  "SJIS".equals(javaName)   )
  +             * 
  +             *   // Work around some bugs in JRE for Japanese
  +             *   if(data[0] == 0x21)
  +             *     isInEncoding = false;
  +             *   else if (ch == 0xA5)
  +             *     isInEncoding = false;
  +             *   else
  +             *     isInEncoding = true;
  +             * }
  +             */ 
  +                
  +            else {
  +                // We don't know for sure, but it looks like it is in the encoding
  +                isInEncoding = true; 
  +            }
  +        }
  +        return isInEncoding;
  +    }
  +
   }
  
  
  
  1.21      +2 -2      xml-xalan/java/src/org/apache/xml/serializer/ToXMLStream.java
  
  Index: ToXMLStream.java
  ===================================================================
  RCS file: /home/cvs/xml-xalan/java/src/org/apache/xml/serializer/ToXMLStream.java,v
  retrieving revision 1.20
  retrieving revision 1.21
  diff -u -r1.20 -r1.21
  --- ToXMLStream.java	7 Apr 2005 04:29:03 -0000	1.20
  +++ ToXMLStream.java	3 Aug 2005 19:20:31 -0000	1.21
  @@ -97,7 +97,7 @@
           setDoctypePublic(xmlListener.getDoctypePublic());        
           setStandalone(xmlListener.getStandalone());
           setMediaType(xmlListener.getMediaType());
  -        m_maxCharacter = xmlListener.m_maxCharacter;
  +        m_encodingInfo = xmlListener.m_encodingInfo;
           m_spaceBeforeClose = xmlListener.m_spaceBeforeClose;
           m_cdataStartCalled = xmlListener.m_cdataStartCalled;
   
  
  
  
  1.14      +86 -101   xml-xalan/java/src/org/apache/xml/serializer/Encodings.java
  
  Index: Encodings.java
  ===================================================================
  RCS file: /home/cvs/xml-xalan/java/src/org/apache/xml/serializer/Encodings.java,v
  retrieving revision 1.13
  retrieving revision 1.14
  diff -u -r1.13 -r1.14
  --- Encodings.java	7 Apr 2005 06:02:51 -0000	1.13
  +++ Encodings.java	3 Aug 2005 19:20:31 -0000	1.14
  @@ -35,9 +35,7 @@
   
   /**
    * Provides information about encodings. Depends on the Java runtime
  - * to provides writers for the different encodings, but can be used
  - * to override encoding names and provide the last printable character
  - * for each encoding.
  + * to provide writers for the different encodings.
    * 
    * This class is only for internal use within Xalan. However, it is used directly
    * by org.apache.xalan.xsltc.compiler.Output.
  @@ -47,12 +45,6 @@
   
   public final class Encodings extends Object
   {
  -
  -    /**
  -     * The last printable character for unknown encodings.
  -     */
  -    private static final int m_defaultLastPrintable = 0x7F;
  -
       /**
        * Standard filename for properties file with encodings data.
        */
  @@ -63,47 +55,17 @@
        */
       private static final String ENCODINGS_PROP = "org.apache.xalan.serialize.encodings";
   
  -    /** SUN JVM internal ByteToChar converter method */
  -    private static final Method
  -        SUN_CHAR2BYTE_CONVERTER_METHOD = findCharToByteConverterMethod();
  -
  -    private static Method findCharToByteConverterMethod() {
  -        Method method = null;
  -        try
  -        {
  -            method = (Method)
  -            AccessController.doPrivileged(new PrivilegedAction() {
  -                public Object run() {
  -                    try {
  -                        Class charToByteConverterClass = (Class) 
  -                            Class.forName("sun.io.CharToByteConverter");
  -                        Class argTypes[] = {String.class};
  -                        return charToByteConverterClass.getMethod("getConverter", argTypes);
  -                    }
  -                    catch (Exception e) {
  -                        throw new RuntimeException(e.toString());
  -                    }
  -                }});
  -        }
  -        catch (Exception e)
  -        {
  -            System.err.println(
  -                "Warning: Could not get charToByteConverterClass!");
  -            method = null;
  -        }
  -
  -        return method;
  -    }
  -
       /**
        * Returns a writer for the specified encoding based on
        * an output stream.
  -     *
  +     * <p>
  +     * This is not a public API.
        * @param output The output stream
        * @param encoding The encoding
        * @return A suitable writer
        * @throws UnsupportedEncodingException There is no convertor
        *  to support this encoding
  +     * @xsl.usage internal
        */
       static Writer getWriter(OutputStream output, String encoding)
           throws UnsupportedEncodingException
  @@ -142,48 +104,17 @@
       }
   
       /**
  -     * Returns an opaque CharToByte converter for the specified encoding.
  -     *
  -     * @param encoding The encoding
  -     * @return An object which should be a sun.io.CharToByteConverter, or null.
  -     */
  -    static Object getCharToByteConverter(String encoding)
  -    {
  -        if (SUN_CHAR2BYTE_CONVERTER_METHOD == null) {
  -            return null;
  -        }
  -
  -        Object args[] = new Object[1];
  -        for (int i = 0; i < _encodings.length; ++i)
  -        {
  -            if (_encodings[i].name.equalsIgnoreCase(encoding))
  -            {
  -                try
  -                {
  -                    args[0] = _encodings[i].javaName;
  -                    Object converter =
  -                        SUN_CHAR2BYTE_CONVERTER_METHOD.invoke(null, args);
  -                    if (null != converter) 
  -                        return converter;
  -                }
  -                catch (Exception iae)
  -                {
  -                    // keep trying
  -                }
  -            }
  -        }
  -
  -        return null;
  -    }
  -
  -    /**
  -     * Returns the last printable character for the specified
  +     * Returns the EncodingInfo object for the specified
        * encoding.
  +     * <p>
  +     * This is not a public API.
        *
        * @param encoding The encoding
  -     * @return The last printable character
  +     * @return The object that is used to determine if 
  +     * characters are in the given encoding.
  +     * @xsl.usage internal
        */
  -    static int getLastPrintable(String encoding)
  +    static EncodingInfo getEncodingInfo(String encoding)
       {
           EncodingInfo ei;
   
  @@ -191,17 +122,23 @@
           ei = (EncodingInfo) _encodingTableKeyJava.get(normalizedEncoding);
           if (ei == null)
               ei = (EncodingInfo) _encodingTableKeyMime.get(normalizedEncoding);
  -        if (ei != null)
  -            return ei.lastPrintable;
  -        return m_defaultLastPrintable;
  +        if (ei == null) {
  +            // We shouldn't have to do this, but just in case.
  +            ei = new EncodingInfo(null,null);
  +        }
  +
  +        return ei;
       }
    
       /**
        * A fast and cheap way to uppercase a String that is
        * only made of printable ASCII characters.
  +     * <p>
  +     * This is not a public API.
        * @param s a String of ASCII characters
        * @return an uppercased version of the input String,
        * possibly the same String.
  +     * @xsl.usage internal
        */
       static private String toUpperCaseFast(final String s) {
   
  @@ -230,17 +167,6 @@
       	return upper;
       }
   
  -    /**
  -     * Returns the last printable character for an unspecified
  -     * encoding.
  -     *
  -     * @return the default size
  -     */
  -    static int getLastPrintable()
  -    {
  -        return m_defaultLastPrintable;
  -    }
  -
       /** The default encoding, ISO style, ISO style.   */
       static final String DEFAULT_MIME_ENCODING = "UTF-8";
   
  @@ -254,11 +180,14 @@
        * whose name does not match the EncName production of the XML Recommendation
        * [XML]. If no encoding attribute is specified, then the XSLT processor should
        * use either UTF-8 or UTF-16."
  +     * <p>
  +     * This is not a public API.
        *
        * @param encoding Reference to java-style encoding string, which may be null,
        * in which case a default will be found.
        *
        * @return The ISO-style encoding string, or null if failure.
  +     * @xsl.usage internal
        */
       static String getMimeEncoding(String encoding)
       {
  @@ -314,10 +243,12 @@
   
       /**
        * Try the best we can to convert a Java encoding to a XML-style encoding.
  -     *
  +     * <p>
  +     * This is not a public API.
        * @param encoding non-null reference to encoding string, java style.
        *
        * @return ISO-style encoding string.
  +     * @xsl.usage internal
        */
       private static String convertJava2MimeEncoding(String encoding)
       {
  @@ -330,10 +261,14 @@
   
       /**
        * Try the best we can to convert a Java encoding to a XML-style encoding.
  +     * <p>
  +     * This is not a public API.
        *
        * @param encoding non-null reference to encoding string, java style.
        *
        * @return ISO-style encoding string.
  +     *
  +     * @xsl.usage internal
        */
       public static String convertMime2JavaEncoding(String encoding)
       {
  @@ -355,6 +290,7 @@
        * System property "encodings" formatted using URL syntax may define an
        * external encodings list. Thanks to Sergey Ushakov for the code
        * contribution!
  +     * @xsl.usage internal
        */
       private static EncodingInfo[] loadEncodingInfo()
       {
  @@ -418,19 +354,19 @@
                   String val = props.getProperty(javaName);
                   int pos = val.indexOf(' ');
                   String mimeName;
  -                int lastPrintable;
  +                //int lastPrintable;
                   if (pos < 0)
                   {
                       // Maybe report/log this problem?
                       //  "Last printable character not defined for encoding " +
                       //  mimeName + " (" + val + ")" ...
                       mimeName = val;
  -                    lastPrintable = 0x00FF;
  +                   // lastPrintable = 0x00FF;
                   }
                   else
                   {
  -                    lastPrintable =
  -                        Integer.decode(val.substring(pos).trim()).intValue();
  +//                    lastPrintable =
  +//                        Integer.decode(val.substring(pos).trim()).intValue();
                       StringTokenizer st =
                           new StringTokenizer(val.substring(0, pos), ",");
                       for (boolean first = true;
  @@ -439,7 +375,7 @@
                       {
                           mimeName = st.nextToken();
                           ret[j] =
  -                            new EncodingInfo(mimeName, javaName, lastPrintable);
  +                            new EncodingInfo(mimeName, javaName);
                           _encodingTableKeyMime.put(
                               mimeName.toUpperCase(),
                               ret[j]);
  @@ -463,6 +399,55 @@
           }
       }
   
  +    /**
  +     * Return true if the character is the high member of a surrogate pair.
  +     * <p>
  +     * This is not a public API.
  +     * @param ch the character to test
  +     * @xsl.usage internal
  +     */
  +    static boolean isHighUTF16Surrogate(char ch) {
  +        return ('\uD800' <= ch && ch <= '\uDBFF');
  +    }
  +    /**
  +     * Return true if the character is the low member of a surrogate pair.
  +     * <p>
  +     * This is not a public API.
  +     * @param ch the character to test
  +     * @xsl.usage internal
  +     */
  +    static boolean isLowUTF16Surrogate(char ch) {
  +        return ('\uDC00' <= ch && ch <= '\uDFFF');
  +    }
  +    /**
  +     * Return the unicode code point represented by the high/low surrogate pair.
  +     * <p>
  +     * This is not a public API.
  +     * @param highSurrogate the high char of the high/low pair
  +     * @param lowSurrogate the low char of the high/low pair
  +     * @xsl.usage internal
  +     */
  +    static int toCodePoint(char highSurrogate, char lowSurrogate) {
  +        int codePoint =
  +            ((highSurrogate - 0xd800) << 10)
  +                + (lowSurrogate - 0xdc00)
  +                + 0x10000;
  +        return codePoint;
  +    }
  +    /**
  +     * Return the unicode code point represented by the char.
  +     * A bit of a dummy method, since all it does is return the char,
  +     * but as an int value.
  +     * <p>
  +     * This is not a public API.
  +     * @param ch the char.
  +     * @xsl.usage internal
  +     */
  +    static int toCodePoint(char ch) {
  +        int codePoint = ch;
  +        return codePoint;
  +    }
  +
       private static final Hashtable _encodingTableKeyJava = new Hashtable();
       private static final Hashtable _encodingTableKeyMime = new Hashtable();
       private static final EncodingInfo[] _encodings = loadEncodingInfo();
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: xalan-cvs-unsubscribe@xml.apache.org
For additional commands, e-mail: xalan-cvs-help@xml.apache.org