You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@xerces.apache.org by le...@locus.apache.org on 2000/12/14 20:22:13 UTC

cvs commit: xml-xerces/java/src/org/apache/xml/serialize EncodingInfo.java SieveEncodingInfo.java BaseMarkupSerializer.java Encodings.java HTMLSerializer.java HTMLdtd.java Makefile OutputFormat.java TextSerializer.java XMLSerializer.java

lehors      00/12/14 11:22:11

  Modified:    java/src/org/apache/xerces/readers MIME2Java.java
               java/src/org/apache/xml/serialize BaseMarkupSerializer.java
                        Encodings.java HTMLSerializer.java HTMLdtd.java
                        Makefile OutputFormat.java TextSerializer.java
                        XMLSerializer.java
  Added:       java/src/org/apache/xml/serialize EncodingInfo.java
                        SieveEncodingInfo.java
  Log:
  Applied patch from TAMURA Kent:
  
  > I know Japanese developers are complaining about the current
  > behavior that all Japanese characters are serialized in
  > character references.  I'm changing the serializer to check
  > whether each character can be encoded or not and it prints
  > character references only for unencodable characters.
  
  The following patch and two new files provide:
   o a solution of the problem described above
   o Surrogate pair support
   o "Windows-31J" encoding support for parsing (MIME2Java.java)
  
  and remove invalid encoding name "UNICODE" in Encodings.java.
  
  Revision  Changes    Path
  1.2       +31 -8     xml-xerces/java/src/org/apache/xerces/readers/MIME2Java.java
  
  Index: MIME2Java.java
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/readers/MIME2Java.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- MIME2Java.java	1999/11/09 01:12:33	1.1
  +++ MIME2Java.java	2000/12/14 19:21:46	1.2
  @@ -469,6 +469,18 @@
    *      </TD>
    *  </TR>
    *  <TR>
  + *      <TD WIDTH="33%">Japanese Windows: An extension of Shift JIS</TD>
  + *      <TD WIDTH="15%">
  + *          <P ALIGN="CENTER">Windows-31J
  + *      </TD>
  + *      <TD WIDTH="12%">
  + *          <P ALIGN="CENTER">MIME
  + *      </TD>
  + *      <TD WIDTH="31%">
  + *          <P ALIGN="CENTER">MS932 (since JDK 1.2)
  + *      </TD>
  + *  </TR>
  + *  <TR>
    *      <TD WIDTH="33%">Chinese: Big5</TD>
    *      <TD WIDTH="15%">
    *          <P ALIGN="CENTER">Big5
  @@ -506,7 +518,7 @@
    *  </TR>
    * </TABLE>
    * 
  - * @version
  + * @version $Id: MIME2Java.java,v 1.2 2000/12/14 19:21:46 lehors Exp $
    * @author TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;
    */
   public class MIME2Java {
  @@ -516,9 +528,9 @@
       
       static {
           s_enchash = new Hashtable();
  -        //    <preferred MIME name>, <Java encoding name>
  +        //    <preferred MIME name (uppercase)>, <Java encoding name>
           s_enchash.put("UTF-8", "UTF8");
  -        s_enchash.put("US-ASCII",        "8859_1");    // ?
  +        s_enchash.put("US-ASCII",        "ASCII");
           s_enchash.put("ISO-8859-1",      "8859_1");
           s_enchash.put("ISO-8859-2",      "8859_2");
           s_enchash.put("ISO-8859-3",      "8859_3");
  @@ -530,6 +542,16 @@
           s_enchash.put("ISO-8859-9",      "8859_9");
           s_enchash.put("ISO-2022-JP",     "JIS");
           s_enchash.put("SHIFT_JIS",       "SJIS");
  +        /**
  +         * MS932 is suitable for Windows-31J,
  +         * but JDK 1.1.x does not support MS932.
  +         */
  +        String version = System.getProperty("java.version");
  +        if (version.equals("1.1") || version.startsWith("1.1.")) {
  +            s_enchash.put("WINDOWS-31J",      "SJIS");
  +        } else {
  +            s_enchash.put("WINDOWS-31J",      "MS932");
  +        }
           s_enchash.put("EUC-JP",          "EUCJIS");
           s_enchash.put("GB2312",          "GB2312");
           s_enchash.put("BIG5",            "Big5");
  @@ -560,9 +582,9 @@
                                                   // ISO-2022-CN? ISO-2022-CN-EXT?
                                                   
           s_revhash = new Hashtable();
  -        //    <Java encoding name>, <preferred MIME name>
  +        //    <Java encoding name (uppercase)>, <preferred MIME name>
           s_revhash.put("UTF8", "UTF-8");
  -        //s_revhash.put("8859_1", "US-ASCII");    // ?
  +        s_revhash.put("ASCII", "US-ASCII");
           s_revhash.put("8859_1", "ISO-8859-1");
           s_revhash.put("8859_2", "ISO-8859-2");
           s_revhash.put("8859_3", "ISO-8859-3");
  @@ -574,6 +596,7 @@
           s_revhash.put("8859_9", "ISO-8859-9");
           s_revhash.put("JIS", "ISO-2022-JP");
           s_revhash.put("SJIS", "Shift_JIS");
  +        s_revhash.put("MS932", "WINDOWS-31J");
           s_revhash.put("EUCJIS", "EUC-JP");
           s_revhash.put("GB2312", "GB2312");
           s_revhash.put("BIG5", "Big5");
  @@ -608,7 +631,7 @@
        * Convert a MIME charset name, also known as an XML encoding name, to a Java encoding name.
        * @param   mimeCharsetName Case insensitive MIME charset name: <code>UTF-8, US-ASCII, ISO-8859-1,
        *                          ISO-8859-2, ISO-8859-3, ISO-8859-4, ISO-8859-5, ISO-8859-6,
  -     *                          ISO-8859-7, ISO-8859-8, ISO-8859-9, ISO-2022-JP, Shift_JIS, 
  +     *                          ISO-8859-7, ISO-8859-8, ISO-8859-9, ISO-2022-JP, Shift_JIS, Windows-31J,
        *                          EUC-JP, GB2312, Big5, EUC-KR, ISO-2022-KR, KOI8-R,
        *                          EBCDIC-CP-US, EBCDIC-CP-CA, EBCDIC-CP-NL, EBCDIC-CP-DK,
        *                          EBCDIC-CP-NO, EBCDIC-CP-FI, EBCDIC-CP-SE, EBCDIC-CP-IT,
  @@ -626,11 +649,11 @@
       /**
        * Convert a Java encoding name to MIME charset name.
        * Available values of <i>encoding</i> are "UTF8", "8859_1", "8859_2", "8859_3", "8859_4",
  -     * "8859_5", "8859_6", "8859_7", "8859_8", "8859_9", "JIS", "SJIS", "EUCJIS",
  +     * "8859_5", "8859_6", "8859_7", "8859_8", "8859_9", "JIS", "SJIS", "MS932", "EUCJIS",
        * "GB2312", "BIG5", "KSC5601", "ISO2022KR",  "KOI8_R", "CP037", "CP277", "CP278",
        * "CP280", "CP284", "CP285", "CP297", "CP420", "CP424", "CP500", "CP870", "CP871" and "CP918".
        * @param   encoding    Case insensitive Java encoding name: <code>UTF8, 8859_1, 8859_2, 8859_3,
  -     *                      8859_4, 8859_5, 8859_6, 8859_7, 8859_8, 8859_9, JIS, SJIS, EUCJIS,
  +     *                      8859_4, 8859_5, 8859_6, 8859_7, 8859_8, 8859_9, JIS, SJIS, MS932, EUCJIS,
        *                      GB2312, BIG5, KSC5601, ISO2022KR, KOI8_R, CP037, CP277, CP278,
        *                      CP280, CP284, CP285, CP297, CP420, CP424, CP500, CP870, CP871 
        *                      and CP918</code>.
  
  
  
  1.21      +29 -29    xml-xerces/java/src/org/apache/xml/serialize/BaseMarkupSerializer.java
  
  Index: BaseMarkupSerializer.java
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xml/serialize/BaseMarkupSerializer.java,v
  retrieving revision 1.20
  retrieving revision 1.21
  diff -u -r1.20 -r1.21
  --- BaseMarkupSerializer.java	2000/09/08 01:45:49	1.20
  +++ BaseMarkupSerializer.java	2000/12/14 19:21:49	1.21
  @@ -134,7 +134,7 @@
    * another element.
    *
    *
  - * @version $Revision: 1.20 $ $Date: 2000/09/08 01:45:49 $
  + * @version $Revision: 1.21 $ $Date: 2000/12/14 19:21:49 $
    * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
    * @see Serializer
    * @see DOMSerializer
  @@ -145,14 +145,7 @@
   {
   
   
  -    /**
  -     * Identifies the last printable character in the Unicode range
  -     * that is supported by the encoding used with this serializer.
  -     * For 8-bit encodings this will be either 0x7E or 0xFF.
  -     * For 16-bit encodings this will be 0xFFFF. Characters that are
  -     * not printable will be escaped using character references.
  -     */
  -    private int              _lastPrintable = 0x7E;
  +    private EncodingInfo _encodingInfo;
   
   
       /**
  @@ -294,8 +287,6 @@
   
       public void setOutputByteStream( OutputStream output )
       {
  -        String encoding;
  -
           if ( output == null )
               throw new NullPointerException( "SER001 Argument 'output' is null." );
           _output = output;
  @@ -343,17 +334,12 @@
           // If the output stream has been set, use it to construct
           // the writer. It is possible that the serializer has been
           // reused with the same output stream and different encoding.
  +
  +        _encodingInfo = _format.getEncodingInfo();
  +
           if ( _output != null ) {
  -            if ( _format.getEncoding() == null )
  -                _writer = new OutputStreamWriter( _output );
  -            else
  -                _writer = Encodings.getWriter( _output, _format.getEncoding() );
  +            _writer = _encodingInfo.getWriter(_output);
           }
  -        // Determine the last printable character.
  -        if ( _format.getEncoding() == null )
  -            _lastPrintable = Encodings.getLastPrintable();
  -        else
  -            _lastPrintable = Encodings.getLastPrintable( _format.getEncoding() );
   
           if ( _format.getIndenting() ) {
               _indenting = true;
  @@ -1125,7 +1111,7 @@
        * @param ch Character value
        * @return Character entity name, or null
        */
  -    protected abstract String getEntityRef( char ch );
  +    protected abstract String getEntityRef(int ch);
   
   
       /**
  @@ -1275,7 +1261,7 @@
       }
   
   
  -    protected void printEscaped( char ch )
  +    protected void printEscaped(int ch)
       {
           String charRef;
   
  @@ -1288,16 +1274,21 @@
               _printer.printText( '&' );
               _printer.printText( charRef );
               _printer.printText( ';' );
  -        } else if ( ( ch >= ' ' && ch <= _lastPrintable && ch != 0xF7 ) ||
  +        } else if ( ( ch >= ' ' && _encodingInfo.isPrintable(ch) && ch != 0xF7 ) ||
                       ch == '\n' || ch == '\r' || ch == '\t' ) {
               // If the character is not printable, print as character reference.
               // Non printables are below ASCII space but not tab or line
               // terminator, ASCII delete, or above a certain Unicode threshold.
  -            _printer.printText( ch );
  +            if (ch < 0x10000) {
  +                _printer.printText((char)ch );
  +            } else {
  +                _printer.printText((char)(((ch-0x10000)>>10)+0xd800));
  +                _printer.printText((char)(((ch-0x10000)&0x3ff)+0xdc00));
  +            }
           } else {
  -            _printer.printText( "&#" );
  -            _printer.printText( Integer.toString( ch ) );
  -            _printer.printText( ';' );
  +            _printer.printText("&#x");
  +            _printer.printText(Integer.toHexString(ch));
  +            _printer.printText(';');
           }
       }
   
  @@ -1312,8 +1303,17 @@
        */
       protected void printEscaped( String source )
       {
  -        for ( int i = 0 ; i < source.length() ; ++i )
  -            printEscaped( source.charAt( i ) );
  +        for ( int i = 0 ; i < source.length() ; ++i ) {
  +            int ch = source.charAt(i);
  +            if ((ch & 0xfc00) == 0xd800 && i+1 < source.length()) {
  +                int lowch = source.charAt(i+1);
  +                if ((lowch & 0xfc00) == 0xdc00) {
  +                    ch = 0x10000 + ((ch-0xd800)<<10) + lowch-0xdc00;
  +                    i++;
  +                }
  +            }
  +            printEscaped(ch);
  +        }
       }
   
   
  
  
  
  1.3       +38 -87    xml-xerces/java/src/org/apache/xml/serialize/Encodings.java
  
  Index: Encodings.java
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xml/serialize/Encodings.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- Encodings.java	2000/08/30 18:59:20	1.2
  +++ Encodings.java	2000/12/14 19:21:50	1.3
  @@ -71,7 +71,7 @@
    * to override encoding names and provide the last printable character
    * for each encoding.
    *
  - * @version $Revision: 1.2 $ $Date: 2000/08/30 18:59:20 $
  + * @version $Id: Encodings.java,v 1.3 2000/12/14 19:21:50 lehors Exp $
    * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
    */
   class Encodings
  @@ -83,104 +83,55 @@
        */
       static final int DefaultLastPrintable = 0x7F;
   
  -
  -    /**
  -     * Returns a writer for the specified encoding based on
  -     * an output stream.
  -     *
  -     * @param output The output stream
  -     * @param encoding The encoding
  -     * @return A suitable writer
  -     * @throws UnsupportedEncodingException There is no convertor
  -     *  to support this encoding
  -     */
  -    static Writer getWriter( OutputStream output, String encoding )
  -        throws UnsupportedEncodingException
  -    {
  -        for ( int i = 0 ; i < _encodings.length ; ++i ) {
  -            if ( _encodings[ i ].name.equals( encoding ) )
  -                return new OutputStreamWriter( output, _encodings[ i ].javaName );
  -        }
  -        return new OutputStreamWriter( output, encoding );
  -    }
  -
  -
       /**
  -     * Returns the last printable character for the specified
  -     * encoding.
  -     *
  -     * @param encoding The encoding
  -     * @return The last printable character
  +     * @param encoding a MIME charset name, or null.
        */
  -    static int getLastPrintable( String encoding )
  -    {
  -        for ( int i = 0 ; i < _encodings.length ; ++i ) {
  -            if ( _encodings[ i ].name.equalsIgnoreCase( encoding ) )
  -                return _encodings[ i ].lastPrintable;
  +    static EncodingInfo getEncodingInfo(String encoding) {
  +        if (encoding == null)
  +            return new EncodingInfo(null, DefaultLastPrintable);
  +        for (int i = 0;  i < _encodings.length;  i++) {
  +            if (_encodings[i].name.equalsIgnoreCase(encoding))
  +                return _encodings[i];
           }
  -        return DefaultLastPrintable;
  +        return new SieveEncodingInfo(encoding, DefaultLastPrintable);
       }
   
  +    static final String JIS_DANGER_CHARS
  +    = "\\\u007e\u007f\u00a2\u00a3\u00a5\u00ac"
  +    +"\u2014\u2015\u2016\u2026\u203e\u203e\u2225\u222f\u301c"
  +    +"\uff3c\uff5e\uffe0\uffe1\uffe2\uffe3";
   
       /**
  -     * Returns the last printable character for an unspecified
  -     * encoding.
  -     */
  -    static int getLastPrintable()
  -    {
  -        return DefaultLastPrintable;
  -    }
  -
  -
  -    /**
  -     * Holds information about a given encoding.
  +     * Constructs a list of all the supported encodings.
        */
  -    static final class EncodingInfo
  -    {
  -       
  +    private static final EncodingInfo[] _encodings = new EncodingInfo[] {
  +        new EncodingInfo("ASCII", 0x7F),
  +        new EncodingInfo("US-ASCII", 0x7F),
  +        new EncodingInfo("ISO-8859-1", 0xFF),
  +        new EncodingInfo("ISO-8859-2", 0xFF),
  +        new EncodingInfo("ISO-8859-3", 0xFF),
  +        new EncodingInfo("ISO-8859-4", 0xFF),
  +        new EncodingInfo("ISO-8859-5", 0xFF),
  +        new EncodingInfo("ISO-8859-6", 0xFF),
  +        new EncodingInfo("ISO-8859-7", 0xFF),
  +        new EncodingInfo("ISO-8859-8", 0xFF),
  +        new EncodingInfo("ISO-8859-9", 0xFF),
           /**
  -         * The encoding name.
  -         */ 
  -        final String name;
  -
  +         * Does JDK's converter support surrogates?
  +         * A Java encoding name "UTF-8" is supported by JDK 1.2 or later.
  +         */
  +        new EncodingInfo("UTF-8", "UTF8", 0x10FFFF),
           /**
  -         * The name used by the Java convertor.
  +         * JDK 1.1 supports "Shift_JIS" as an alias of "SJIS".
  +         * But JDK 1.2 treats "Shift_JIS" as an alias of "MS932".
  +         * The JDK 1.2's behavior is invalid against IANA registrations.
            */
  -        final String javaName;
  -
  +        new SieveEncodingInfo("Shift_JIS", "SJIS", 0x7F, JIS_DANGER_CHARS),
           /**
  -         * The last printable character.
  +         * "MS932" is supported by JDK 1.2 or later.
            */
  -        final int    lastPrintable;
  -
  -        EncodingInfo( String name, String javaName, int lastPrintable )
  -        {
  -            this.name = name;
  -            this.javaName = javaName;
  -            this.lastPrintable = lastPrintable;
  -        }
  -
  -    }
  -
  -
  -    /**
  -     * Constructs a list of all the supported encodings.
  -     */
  -    private static final EncodingInfo[] _encodings = new EncodingInfo[] {
  -        new EncodingInfo( "ASCII", "ASCII", 0x7F ),
  -        new EncodingInfo( "ISO-Latin-1", "ASCII", 0xFF ),
  -        new EncodingInfo( "ISO-8859-1", "ISO8859_1", 0xFF ),
  -        new EncodingInfo( "ISO-8859-2", "ISO8859_2", 0xFF ),
  -        new EncodingInfo( "ISO-8859-3", "ISO8859_3", 0xFF ),
  -        new EncodingInfo( "ISO-8859-4", "ISO8859_4", 0xFF ),
  -        new EncodingInfo( "ISO-8859-5", "ISO8859_5", 0xFF ),
  -        new EncodingInfo( "ISO-8859-6", "ISO8859_6", 0xFF ),
  -        new EncodingInfo( "ISO-8859-7", "ISO8859_7", 0xFF ),
  -        new EncodingInfo( "ISO-8859-8", "ISO8859_8", 0xFF ),
  -        new EncodingInfo( "ISO-8859-9", "ISO8859_9", 0xFF ),
  -        new EncodingInfo( "UTF-8", "UTF8", 0xFFFF ),
  -        new EncodingInfo( "UNICODE", "Unicode", 0xFFFF )
  +        new SieveEncodingInfo("Windows-31J", "MS932", 0x7F, JIS_DANGER_CHARS),
  +        new SieveEncodingInfo("EUC-JP", null, 0x7F, JIS_DANGER_CHARS),
  +        new SieveEncodingInfo("ISO-2022-JP", null, 0x7F, JIS_DANGER_CHARS),
       };
  -
  -
   }
  
  
  
  1.14      +2 -2      xml-xerces/java/src/org/apache/xml/serialize/HTMLSerializer.java
  
  Index: HTMLSerializer.java
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xml/serialize/HTMLSerializer.java,v
  retrieving revision 1.13
  retrieving revision 1.14
  diff -u -r1.13 -r1.14
  --- HTMLSerializer.java	2000/08/30 18:59:21	1.13
  +++ HTMLSerializer.java	2000/12/14 19:21:51	1.14
  @@ -116,7 +116,7 @@
    * </ul>
    *
    *
  - * @version $Revision: 1.13 $ $Date: 2000/08/30 18:59:21 $
  + * @version $Revision: 1.14 $ $Date: 2000/12/14 19:21:51 $
    * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
    * @see Serializer
    */
  @@ -811,7 +811,7 @@
       }
       
       
  -    protected String getEntityRef( char ch )
  +    protected String getEntityRef(int ch)
       {
           return HTMLdtd.fromChar( ch );
       }
  
  
  
  1.10      +5 -2      xml-xerces/java/src/org/apache/xml/serialize/HTMLdtd.java
  
  Index: HTMLdtd.java
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xml/serialize/HTMLdtd.java,v
  retrieving revision 1.9
  retrieving revision 1.10
  diff -u -r1.9 -r1.10
  --- HTMLdtd.java	2000/08/30 18:59:21	1.9
  +++ HTMLdtd.java	2000/12/14 19:21:52	1.10
  @@ -81,7 +81,7 @@
    * first time any of these methods is called for fast and efficient access.
    *
    *
  - * @version $Revision: 1.9 $ $Date: 2000/08/30 18:59:21 $
  + * @version $Revision: 1.10 $ $Date: 2000/12/14 19:21:52 $
    * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
    */
   public final class HTMLdtd
  @@ -353,8 +353,11 @@
        * @param value Character value of entity
        * @return Entity's name or null
        */
  -    public static String fromChar( char value )
  +    public static String fromChar(int value )
       {
  +        if (value > 0xffff)
  +            return null;
  +
           String    name;
           
           initialize();
  
  
  
  1.5       +4 -1      xml-xerces/java/src/org/apache/xml/serialize/Makefile
  
  Index: Makefile
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xml/serialize/Makefile,v
  retrieving revision 1.4
  retrieving revision 1.5
  diff -u -r1.4 -r1.5
  --- Makefile	2000/02/29 02:02:26	1.4
  +++ Makefile	2000/12/14 19:21:53	1.5
  @@ -14,7 +14,10 @@
   	ElementState.class\
   	HTMLdtd.class\
   	SerializerFactory.class\
  -	SerializerFactoryImpl.class
  +	SerializerFactoryImpl.class \
  +	EncodingInfo.class \
  +	SieveEncodingInfo.class \
  +	Encodings.class
   
   DIRS =
   
  
  
  
  1.10      +25 -2     xml-xerces/java/src/org/apache/xml/serialize/OutputFormat.java
  
  Index: OutputFormat.java
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xml/serialize/OutputFormat.java,v
  retrieving revision 1.9
  retrieving revision 1.10
  diff -u -r1.9 -r1.10
  --- OutputFormat.java	2000/08/30 18:59:21	1.9
  +++ OutputFormat.java	2000/12/14 19:21:54	1.10
  @@ -91,7 +91,7 @@
    * </ul>
    *
    *
  - * @version $Revision: 1.9 $ $Date: 2000/08/30 18:59:21 $
  + * @version $Revision: 1.10 $ $Date: 2000/12/14 19:21:54 $
    * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
    *         <a href="mailto:visco@intalio.com">Keith Visco</a>
    * @see Serializer
  @@ -184,6 +184,10 @@
        */
       private String _encoding = Defaults.Encoding;
   
  +    /**
  +     * The EncodingInfo instance for _encoding.
  +     */
  +    private EncodingInfo _encodingInfo = null;
   
       /**
        * The specified media type or null.
  @@ -216,7 +220,6 @@
   
   
       /**
  -<<<<<<< OutputFormat.java
        * True if comments should be omitted;
        */
       private boolean _omitComments = false;
  @@ -477,8 +480,28 @@
       public void setEncoding( String encoding )
       {
           _encoding = encoding;
  +        _encodingInfo = null;
  +    }
  +
  +    /**
  +     * Sets the encoding for this output method with an <code>EncodingInfo</code>
  +     * instance.
  +     */
  +    public void setEncoding(EncodingInfo encInfo) {
  +        _encoding = encInfo.getName();
  +        _encodingInfo = encInfo;
       }
   
  +    /**
  +     * Returns an <code>EncodingInfo</code> instance for the encoding.
  +     *
  +     * @see #setEncoding
  +     */
  +    public EncodingInfo getEncodingInfo() {
  +        if (_encodingInfo == null)
  +            _encodingInfo = Encodings.getEncodingInfo(_encoding);
  +        return _encodingInfo;
  +    }
   
       /**
        * Returns the specified media type, or null.
  
  
  
  1.8       +2 -2      xml-xerces/java/src/org/apache/xml/serialize/TextSerializer.java
  
  Index: TextSerializer.java
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xml/serialize/TextSerializer.java,v
  retrieving revision 1.7
  retrieving revision 1.8
  diff -u -r1.7 -r1.8
  --- TextSerializer.java	2000/08/30 18:59:22	1.7
  +++ TextSerializer.java	2000/12/14 19:21:55	1.8
  @@ -90,7 +90,7 @@
    * org.xml.sax.DocumentHandler#endDocument}.
    *
    *
  - * @version $Revision: 1.7 $ $Date: 2000/08/30 18:59:22 $
  + * @version $Revision: 1.8 $ $Date: 2000/12/14 19:21:55 $
    * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
    * @see Serializer
    */
  @@ -388,7 +388,7 @@
       }
       
       
  -    protected String getEntityRef( char ch )
  +    protected String getEntityRef( int ch )
       {
           return null;
       }
  
  
  
  1.17      +2 -2      xml-xerces/java/src/org/apache/xml/serialize/XMLSerializer.java
  
  Index: XMLSerializer.java
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xml/serialize/XMLSerializer.java,v
  retrieving revision 1.16
  retrieving revision 1.17
  diff -u -r1.16 -r1.17
  --- XMLSerializer.java	2000/08/30 18:59:22	1.16
  +++ XMLSerializer.java	2000/12/14 19:21:56	1.17
  @@ -104,7 +104,7 @@
    * spaces at beginning of line will be stripped.
    *
    *
  - * @version $Revision: 1.16 $ $Date: 2000/08/30 18:59:22 $
  + * @version $Revision: 1.17 $ $Date: 2000/12/14 19:21:56 $
    * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
    * @see Serializer
    */
  @@ -652,7 +652,7 @@
       }
   
   
  -    protected String getEntityRef( char ch )
  +    protected String getEntityRef(int ch)
       {
           // Encode special XML characters into the equivalent character references.
           // These five are defined by default for all XML documents.
  
  
  
  1.1                  xml-xerces/java/src/org/apache/xml/serialize/EncodingInfo.java
  
  Index: EncodingInfo.java
  ===================================================================
  /*
   * The Apache Software License, Version 1.1
   *
   *
   * Copyright (c) 2000 The Apache Software Foundation.  All rights 
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer. 
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:  
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Xerces" and "Apache Software Foundation" must
   *    not be used to endorse or promote products derived from this
   *    software without prior written permission. For written 
   *    permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    nor may "Apache" appear in their name, without prior written
   *    permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation and was
   * originally based on software copyright (c) 1999, International
   * Business Machines, Inc., http://www.apache.org.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  
  package org.apache.xml.serialize;
  
  import java.io.OutputStream;
  import java.io.OutputStreamWriter;
  import java.io.UnsupportedEncodingException;
  import java.io.Writer;
  
  /**
   * This class represents an encoding.
   *
   * @version $Id: EncodingInfo.java,v 1.1 2000/12/14 19:21:50 lehors Exp $
   */
  public class EncodingInfo {

      /** MIME charset name of this encoding; may be <code>null</code>. */
      String name;

      /** Encoding name passed to the Java converter; may be <code>null</code>. */
      String javaName;

      /** Largest code point that {@link #isPrintable(int)} reports as printable. */
      int lastPrintable;

      /**
       * Builds an encoding description from a MIME name and a Java converter name.
       *
       * @param mimeName      MIME charset name of the encoding
       * @param javaName      Java converter name; when <code>null</code> the MIME
       *                      name is used in its place
       * @param lastPrintable largest code point treated as printable
       */
      public EncodingInfo(String mimeName, String javaName, int lastPrintable) {
          this.name = mimeName;
          if (javaName != null) {
              this.javaName = javaName;
          } else {
              // No dedicated converter name given: reuse the MIME name.
              this.javaName = mimeName;
          }
          this.lastPrintable = lastPrintable;
      }

      /**
       * Builds an encoding description whose Java converter name equals its
       * MIME name.
       *
       * @param mimeName      MIME charset name of the encoding
       * @param lastPrintable largest code point treated as printable
       */
      public EncodingInfo(String mimeName, int lastPrintable) {
          // A null Java name makes the main constructor fall back to mimeName.
          this(mimeName, null, lastPrintable);
      }

      /**
       * Gives the MIME charset name of this encoding.
       *
       * @return the MIME name supplied at construction time
       */
      public String getName() {
          return this.name;
      }

      /**
       * Wraps an output stream in a writer that uses this encoding.
       * When no Java encoding name is known, the writer is created without
       * one and therefore uses the platform default encoding.
       *
       * @param output the byte stream to write to
       * @return a writer on top of <code>output</code>
       * @exception UnsupportedEncodingException if there is no converter
       *  supporting this encoding
       */
      public Writer getWriter(OutputStream output)
          throws UnsupportedEncodingException {
          final String converter = this.javaName;
          return converter != null
              ? new OutputStreamWriter(output, converter)
              : new OutputStreamWriter(output);
      }

      /**
       * Tells whether the given code point may be written as-is.
       *
       * @param ch a code point (0-0x10ffff)
       * @return <code>true</code> when <code>ch</code> does not exceed the last
       *         printable code point of this encoding
       */
      public boolean isPrintable(int ch) {
          return this.lastPrintable >= ch;
      }
  }
  
  
  
  1.1                  xml-xerces/java/src/org/apache/xml/serialize/SieveEncodingInfo.java
  
  Index: SieveEncodingInfo.java
  ===================================================================
  /*
   * The Apache Software License, Version 1.1
   *
   *
   * Copyright (c) 2000 The Apache Software Foundation.  All rights 
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer. 
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:  
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Xerces" and "Apache Software Foundation" must
   *    not be used to endorse or promote products derived from this
   *    software without prior written permission. For written 
   *    permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    nor may "Apache" appear in their name, without prior written
   *    permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation and was
   * originally based on software copyright (c) 1999, International
   * Business Machines, Inc., http://www.apache.org.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  
  package org.apache.xml.serialize;
  
  import java.io.ByteArrayOutputStream;
  import java.io.IOException;
  import java.io.OutputStreamWriter;
  import java.io.Writer;
  
  /**
   * This class represents an encoding whose set of printable characters
   * is determined by actually running each candidate character through
   * a <code>java.io.OutputStreamWriter</code> created for the encoding and
   * checking whether the writer emitted a substitution byte
   * (<code>'?'</code>) instead of a real encoding of the character.
   *
   * @version $Id: SieveEncodingInfo.java,v 1.1 2000/12/14 19:21:55 lehors Exp $
   */
  public class SieveEncodingInfo extends EncodingInfo {
  
      /** Scratch byte sink shared by all checks; created lazily, guarded by <code>this</code>. */
      BAOutputStream checkerStream = null;
      /** Writer that encodes into {@link #checkerStream}; created lazily, guarded by <code>this</code>. */
      Writer checkerWriter = null;
      /** Characters that are never printable in this encoding, or <code>null</code> if there are none. */
      String dangerChars = null;
  
      /**
       * Creates a new <code>SieveEncodingInfo</code> instance.
       *
       * @param mimeName The MIME name of the encoding.
       * @param javaName The encoding name handed to
       *                 <code>java.io.OutputStreamWriter</code>.
       * @param lastPrintable Every code point less than or equal to this
       *                 value is reported as printable without consulting
       *                 the encoder (unless it is listed in <code>dangers</code>).
       * @param dangers A sorted string of characters that are always printed
       *                 as character references; may be <code>null</code>.
       */
      public SieveEncodingInfo(String mimeName, String javaName,
                               int lastPrintable, String dangers) {
          super(mimeName, javaName, lastPrintable);
          this.dangerChars = dangers;
      }
  
      /**
       * Creates a new <code>SieveEncodingInfo</code> instance whose Java
       * encoding name is the same as its MIME name and which has no
       * "danger" characters.
       *
       * @param mimeName The MIME name of the encoding, also used as the Java name.
       * @param lastPrintable Every code point less than or equal to this
       *                 value is reported as printable without consulting
       *                 the encoder.
       */
      public SieveEncodingInfo(String mimeName, int lastPrintable) {
          this(mimeName, mimeName, lastPrintable, null);
      }
  
      /**
       * Checks whether the specified character is printable or not.
       *
       * <p>The decision is made in this order: characters listed in
       * <code>dangerChars</code> are never printable; code points up to
       * <code>lastPrintable</code> are always printable; anything else is
       * encoded into a scratch buffer and is considered unprintable when
       * the output consists only of <code>'?'</code> substitution bytes
       * or when encoding fails with an <code>IOException</code>.</p>
       *
       * @param ch a code point (0-0x10ffff)
       * @return <code>true</code> if the character can be written as is,
       *         <code>false</code> if it has to be written as a character
       *         reference.
       */
      public boolean isPrintable(int ch) {
          if (this.dangerChars != null && ch <= 0xffff) {
              // Searches this.dangerChars for ch.
              // TODO: Use binary search.
              if (this.dangerChars.indexOf(ch) >= 0)
                  return false;
          }
  
          if (ch <= this.lastPrintable)
              return true;
  
          // A lone surrogate or a value beyond the Unicode range is not a
          // character and cannot be encoded.  Rejecting it here also avoids
          // feeding half of a surrogate pair to the shared writer, which
          // could leave pending state behind and disturb later checks.
          if ((ch >= 0xd800 && ch <= 0xdfff) || ch > 0x10ffff)
              return false;
  
          boolean printable = true;
          synchronized (this) {
              try {
                  if (this.checkerWriter == null) {
                      if (this.checkerStream == null)
                          this.checkerStream = new BAOutputStream(10);
                      this.checkerWriter = new OutputStreamWriter(this.checkerStream, this.javaName);
                  }
  
                  int maxSubstitutes;
                  if (ch > 0xffff) {
                      // Supplementary character: write it as a surrogate pair.
                      this.checkerWriter.write(((ch-0x10000)>>10)+0xd800);
                      this.checkerWriter.write(((ch-0x10000)&0x3ff)+0xdc00);
                      // Depending on the converter, an unmappable pair may be
                      // replaced by one '?' for the pair or one per surrogate.
                      maxSubstitutes = 2;
                  } else {
                      this.checkerWriter.write(ch);
                      maxSubstitutes = 1;
                  }
                  // The writer buffers encoded bytes; without a flush the
                  // scratch stream would still be empty when inspected.
                  this.checkerWriter.flush();
                  if (isSubstitution(maxSubstitutes))
                      printable = false;
              } catch (IOException ioe) {
                  printable = false;
                  // Do not reuse a writer that failed part-way through;
                  // a fresh one is created on the next call.
                  this.checkerWriter = null;
              } finally {
                  // Always empty the scratch buffer so that bytes from this
                  // check can never leak into the next one.
                  if (this.checkerStream != null)
                      this.checkerStream.reset();
              }
          }
  
          return printable;
      }
  
      /**
       * Tells whether the scratch buffer currently holds nothing but
       * <code>'?'</code> bytes, at least one and at most
       * <code>maxSubstitutes</code> of them.
       * Must be called while holding the lock on <code>this</code>.
       */
      private boolean isSubstitution(int maxSubstitutes) {
          int size = this.checkerStream.size();
          if (size < 1 || size > maxSubstitutes)
              return false;
          byte[] result = this.checkerStream.getBuffer();
          for (int i = 0;  i < size;  i++) {
              if (result[i] != '?')
                  return false;
          }
          return true;
      }
  
      /**
       * A <code>ByteArrayOutputStream</code> that exposes its internal buffer.
       *
       * Why don't we use the original ByteArrayOutputStream?
       * - Because the toByteArray() method of the ByteArrayOutputStream
       * creates new byte[] instances for each call.
       */
      static class BAOutputStream extends ByteArrayOutputStream {
          BAOutputStream() {
              super();
          }
  
          BAOutputStream(int size) {
              super(size);
          }
  
          /**
           * Returns the internal buffer itself, not a copy.  Only the first
           * <code>size()</code> bytes are valid.
           */
          byte[] getBuffer() {
              return this.buf;
          }
      }
  
  }