You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@xerces.apache.org by le...@locus.apache.org on 2000/12/14 20:22:13 UTC
cvs commit: xml-xerces/java/src/org/apache/xml/serialize EncodingInfo.java SieveEncodingInfo.java BaseMarkupSerializer.java Encodings.java HTMLSerializer.java HTMLdtd.java Makefile OutputFormat.java TextSerializer.java XMLSerializer.java
lehors 00/12/14 11:22:11
Modified: java/src/org/apache/xerces/readers MIME2Java.java
java/src/org/apache/xml/serialize BaseMarkupSerializer.java
Encodings.java HTMLSerializer.java HTMLdtd.java
Makefile OutputFormat.java TextSerializer.java
XMLSerializer.java
Added: java/src/org/apache/xml/serialize EncodingInfo.java
SieveEncodingInfo.java
Log:
Applied patch from TAMURA Kent:
> I know Japanese developers are complaining about the current
> behavior that all Japanese characters are serialized in
> character references. I'm changing the serializer to check
> whether each character can be encoded or not and it prints
> character references only for unencodable characters.
The following patch and two new files provide:
o a solution to the problem described above
o surrogate pair support
o "Windows-31J" encoding support for parsing (MIME2Java.java)
and remove the invalid encoding name "UNICODE" from Encodings.java.
Revision Changes Path
1.2 +31 -8 xml-xerces/java/src/org/apache/xerces/readers/MIME2Java.java
Index: MIME2Java.java
===================================================================
RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/readers/MIME2Java.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- MIME2Java.java 1999/11/09 01:12:33 1.1
+++ MIME2Java.java 2000/12/14 19:21:46 1.2
@@ -469,6 +469,18 @@
* </TD>
* </TR>
* <TR>
+ * <TD WIDTH="33%">Japanese Windows: An extension of Shift JIS</TD>
+ * <TD WIDTH="15%">
+ * <P ALIGN="CENTER">Windows-31J
+ * </TD>
+ * <TD WIDTH="12%">
+ * <P ALIGN="CENTER">MIME
+ * </TD>
+ * <TD WIDTH="31%">
+ * <P ALIGN="CENTER">MS932 (since JDK 1.2)
+ * </TD>
+ * </TR>
+ * <TR>
* <TD WIDTH="33%">Chinese: Big5</TD>
* <TD WIDTH="15%">
* <P ALIGN="CENTER">Big5
@@ -506,7 +518,7 @@
* </TR>
* </TABLE>
*
- * @version
+ * @version $Id: MIME2Java.java,v 1.2 2000/12/14 19:21:46 lehors Exp $
* @author TAMURA Kent <kent@trl.ibm.co.jp>
*/
public class MIME2Java {
@@ -516,9 +528,9 @@
static {
s_enchash = new Hashtable();
- // <preferred MIME name>, <Java encoding name>
+ // <preferred MIME name (uppercase)>, <Java encoding name>
s_enchash.put("UTF-8", "UTF8");
- s_enchash.put("US-ASCII", "8859_1"); // ?
+ s_enchash.put("US-ASCII", "ASCII");
s_enchash.put("ISO-8859-1", "8859_1");
s_enchash.put("ISO-8859-2", "8859_2");
s_enchash.put("ISO-8859-3", "8859_3");
@@ -530,6 +542,16 @@
s_enchash.put("ISO-8859-9", "8859_9");
s_enchash.put("ISO-2022-JP", "JIS");
s_enchash.put("SHIFT_JIS", "SJIS");
+ /**
+ * MS932 is suitable for Windows-31J,
+ * but JDK 1.1.x does not support MS932.
+ */
+ String version = System.getProperty("java.version");
+ if (version.equals("1.1") || version.startsWith("1.1.")) {
+ s_enchash.put("WINDOWS-31J", "SJIS");
+ } else {
+ s_enchash.put("WINDOWS-31J", "MS932");
+ }
s_enchash.put("EUC-JP", "EUCJIS");
s_enchash.put("GB2312", "GB2312");
s_enchash.put("BIG5", "Big5");
@@ -560,9 +582,9 @@
// ISO-2022-CN? ISO-2022-CN-EXT?
s_revhash = new Hashtable();
- // <Java encoding name>, <preferred MIME name>
+ // <Java encoding name (uppercase)>, <preferred MIME name>
s_revhash.put("UTF8", "UTF-8");
- //s_revhash.put("8859_1", "US-ASCII"); // ?
+ s_revhash.put("ASCII", "US-ASCII");
s_revhash.put("8859_1", "ISO-8859-1");
s_revhash.put("8859_2", "ISO-8859-2");
s_revhash.put("8859_3", "ISO-8859-3");
@@ -574,6 +596,7 @@
s_revhash.put("8859_9", "ISO-8859-9");
s_revhash.put("JIS", "ISO-2022-JP");
s_revhash.put("SJIS", "Shift_JIS");
+ s_revhash.put("MS932", "WINDOWS-31J");
s_revhash.put("EUCJIS", "EUC-JP");
s_revhash.put("GB2312", "GB2312");
s_revhash.put("BIG5", "Big5");
@@ -608,7 +631,7 @@
* Convert a MIME charset name, also known as an XML encoding name, to a Java encoding name.
* @param mimeCharsetName Case insensitive MIME charset name: <code>UTF-8, US-ASCII, ISO-8859-1,
* ISO-8859-2, ISO-8859-3, ISO-8859-4, ISO-8859-5, ISO-8859-6,
- * ISO-8859-7, ISO-8859-8, ISO-8859-9, ISO-2022-JP, Shift_JIS,
+ * ISO-8859-7, ISO-8859-8, ISO-8859-9, ISO-2022-JP, Shift_JIS, Windows-31J
* EUC-JP, GB2312, Big5, EUC-KR, ISO-2022-KR, KOI8-R,
* EBCDIC-CP-US, EBCDIC-CP-CA, EBCDIC-CP-NL, EBCDIC-CP-DK,
* EBCDIC-CP-NO, EBCDIC-CP-FI, EBCDIC-CP-SE, EBCDIC-CP-IT,
@@ -626,11 +649,11 @@
/**
* Convert a Java encoding name to MIME charset name.
* Available values of <i>encoding</i> are "UTF8", "8859_1", "8859_2", "8859_3", "8859_4",
- * "8859_5", "8859_6", "8859_7", "8859_8", "8859_9", "JIS", "SJIS", "EUCJIS",
+ * "8859_5", "8859_6", "8859_7", "8859_8", "8859_9", "JIS", "SJIS", "MS932", "EUCJIS",
* "GB2312", "BIG5", "KSC5601", "ISO2022KR", "KOI8_R", "CP037", "CP277", "CP278",
* "CP280", "CP284", "CP285", "CP297", "CP420", "CP424", "CP500", "CP870", "CP871" and "CP918".
* @param encoding Case insensitive Java encoding name: <code>UTF8, 8859_1, 8859_2, 8859_3,
- * 8859_4, 8859_5, 8859_6, 8859_7, 8859_8, 8859_9, JIS, SJIS, EUCJIS,
+ * 8859_4, 8859_5, 8859_6, 8859_7, 8859_8, 8859_9, JIS, SJIS, MS932, EUCJIS,
* GB2312, BIG5, KSC5601, ISO2022KR, KOI8_R, CP037, CP277, CP278,
* CP280, CP284, CP285, CP297, CP420, CP424, CP500, CP870, CP871
* and CP918</code>.
1.21 +29 -29 xml-xerces/java/src/org/apache/xml/serialize/BaseMarkupSerializer.java
Index: BaseMarkupSerializer.java
===================================================================
RCS file: /home/cvs/xml-xerces/java/src/org/apache/xml/serialize/BaseMarkupSerializer.java,v
retrieving revision 1.20
retrieving revision 1.21
diff -u -r1.20 -r1.21
--- BaseMarkupSerializer.java 2000/09/08 01:45:49 1.20
+++ BaseMarkupSerializer.java 2000/12/14 19:21:49 1.21
@@ -134,7 +134,7 @@
* another element.
*
*
- * @version $Revision: 1.20 $ $Date: 2000/09/08 01:45:49 $
+ * @version $Revision: 1.21 $ $Date: 2000/12/14 19:21:49 $
* @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
* @see Serializer
* @see DOMSerializer
@@ -145,14 +145,7 @@
{
- /**
- * Identifies the last printable character in the Unicode range
- * that is supported by the encoding used with this serializer.
- * For 8-bit encodings this will be either 0x7E or 0xFF.
- * For 16-bit encodings this will be 0xFFFF. Characters that are
- * not printable will be escaped using character references.
- */
- private int _lastPrintable = 0x7E;
+ private EncodingInfo _encodingInfo;
/**
@@ -294,8 +287,6 @@
public void setOutputByteStream( OutputStream output )
{
- String encoding;
-
if ( output == null )
throw new NullPointerException( "SER001 Argument 'output' is null." );
_output = output;
@@ -343,17 +334,12 @@
// If the output stream has been set, use it to construct
// the writer. It is possible that the serializer has been
// reused with the same output stream and different encoding.
+
+ _encodingInfo = _format.getEncodingInfo();
+
if ( _output != null ) {
- if ( _format.getEncoding() == null )
- _writer = new OutputStreamWriter( _output );
- else
- _writer = Encodings.getWriter( _output, _format.getEncoding() );
+ _writer = _encodingInfo.getWriter(_output);
}
- // Determine the last printable character.
- if ( _format.getEncoding() == null )
- _lastPrintable = Encodings.getLastPrintable();
- else
- _lastPrintable = Encodings.getLastPrintable( _format.getEncoding() );
if ( _format.getIndenting() ) {
_indenting = true;
@@ -1125,7 +1111,7 @@
* @param ch Character value
* @return Character entity name, or null
*/
- protected abstract String getEntityRef( char ch );
+ protected abstract String getEntityRef(int ch);
/**
@@ -1275,7 +1261,7 @@
}
- protected void printEscaped( char ch )
+ protected void printEscaped(int ch)
{
String charRef;
@@ -1288,16 +1274,21 @@
_printer.printText( '&' );
_printer.printText( charRef );
_printer.printText( ';' );
- } else if ( ( ch >= ' ' && ch <= _lastPrintable && ch != 0xF7 ) ||
+ } else if ( ( ch >= ' ' && _encodingInfo.isPrintable(ch) && ch != 0xF7 ) ||
ch == '\n' || ch == '\r' || ch == '\t' ) {
// If the character is not printable, print as character reference.
// Non printables are below ASCII space but not tab or line
// terminator, ASCII delete, or above a certain Unicode threshold.
- _printer.printText( ch );
+ if (ch < 0x10000) {
+ _printer.printText((char)ch );
+ } else {
+ _printer.printText((char)(((ch-0x10000)>>10)+0xd800));
+ _printer.printText((char)(((ch-0x10000)&0x3ff)+0xdc00));
+ }
} else {
- _printer.printText( "&#" );
- _printer.printText( Integer.toString( ch ) );
- _printer.printText( ';' );
+ _printer.printText("&#x");
+ _printer.printText(Integer.toHexString(ch));
+ _printer.printText(';');
}
}
@@ -1312,8 +1303,17 @@
*/
protected void printEscaped( String source )
{
- for ( int i = 0 ; i < source.length() ; ++i )
- printEscaped( source.charAt( i ) );
+ for ( int i = 0 ; i < source.length() ; ++i ) {
+ int ch = source.charAt(i);
+ if ((ch & 0xfc00) == 0xd800 && i+1 < source.length()) {
+ int lowch = source.charAt(i+1);
+ if ((lowch & 0xfc00) == 0xdc00) {
+ ch = 0x10000 + ((ch-0xd800)<<10) + lowch-0xdc00;
+ i++;
+ }
+ }
+ printEscaped(ch);
+ }
}
1.3 +38 -87 xml-xerces/java/src/org/apache/xml/serialize/Encodings.java
Index: Encodings.java
===================================================================
RCS file: /home/cvs/xml-xerces/java/src/org/apache/xml/serialize/Encodings.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- Encodings.java 2000/08/30 18:59:20 1.2
+++ Encodings.java 2000/12/14 19:21:50 1.3
@@ -71,7 +71,7 @@
* to override encoding names and provide the last printable character
* for each encoding.
*
- * @version $Revision: 1.2 $ $Date: 2000/08/30 18:59:20 $
+ * @version $Id: Encodings.java,v 1.3 2000/12/14 19:21:50 lehors Exp $
* @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
*/
class Encodings
@@ -83,104 +83,55 @@
*/
static final int DefaultLastPrintable = 0x7F;
-
- /**
- * Returns a writer for the specified encoding based on
- * an output stream.
- *
- * @param output The output stream
- * @param encoding The encoding
- * @return A suitable writer
- * @throws UnsupportedEncodingException There is no convertor
- * to support this encoding
- */
- static Writer getWriter( OutputStream output, String encoding )
- throws UnsupportedEncodingException
- {
- for ( int i = 0 ; i < _encodings.length ; ++i ) {
- if ( _encodings[ i ].name.equals( encoding ) )
- return new OutputStreamWriter( output, _encodings[ i ].javaName );
- }
- return new OutputStreamWriter( output, encoding );
- }
-
-
/**
- * Returns the last printable character for the specified
- * encoding.
- *
- * @param encoding The encoding
- * @return The last printable character
+ * @param encoding a MIME charset name, or null.
*/
- static int getLastPrintable( String encoding )
- {
- for ( int i = 0 ; i < _encodings.length ; ++i ) {
- if ( _encodings[ i ].name.equalsIgnoreCase( encoding ) )
- return _encodings[ i ].lastPrintable;
+ static EncodingInfo getEncodingInfo(String encoding) {
+ if (encoding == null)
+ return new EncodingInfo(null, DefaultLastPrintable);
+ for (int i = 0; i < _encodings.length; i++) {
+ if (_encodings[i].name.equalsIgnoreCase(encoding))
+ return _encodings[i];
}
- return DefaultLastPrintable;
+ return new SieveEncodingInfo(encoding, DefaultLastPrintable);
}
+ static final String JIS_DANGER_CHARS
+ = "\\\u007e\u007f\u00a2\u00a3\u00a5\u00ac"
+ +"\u2014\u2015\u2016\u2026\u203e\u203e\u2225\u222f\u301c"
+ +"\uff3c\uff5e\uffe0\uffe1\uffe2\uffe3";
/**
- * Returns the last printable character for an unspecified
- * encoding.
- */
- static int getLastPrintable()
- {
- return DefaultLastPrintable;
- }
-
-
- /**
- * Holds information about a given encoding.
+ * Constructs a list of all the supported encodings.
*/
- static final class EncodingInfo
- {
-
+ private static final EncodingInfo[] _encodings = new EncodingInfo[] {
+ new EncodingInfo("ASCII", 0x7F),
+ new EncodingInfo("US-ASCII", 0x7F),
+ new EncodingInfo("ISO-8859-1", 0xFF),
+ new EncodingInfo("ISO-8859-2", 0xFF),
+ new EncodingInfo("ISO-8859-3", 0xFF),
+ new EncodingInfo("ISO-8859-4", 0xFF),
+ new EncodingInfo("ISO-8859-5", 0xFF),
+ new EncodingInfo("ISO-8859-6", 0xFF),
+ new EncodingInfo("ISO-8859-7", 0xFF),
+ new EncodingInfo("ISO-8859-8", 0xFF),
+ new EncodingInfo("ISO-8859-9", 0xFF),
/**
- * The encoding name.
- */
- final String name;
-
+ * Does JDK's converter supprt surrogates?
+ * A Java encoding name "UTF-8" is suppoted by JDK 1.2 or later.
+ */
+ new EncodingInfo("UTF-8", "UTF8", 0x10FFFF),
/**
- * The name used by the Java convertor.
+ * JDK 1.1 supports "Shift_JIS" as an alias of "SJIS".
+ * But JDK 1.2 treats "Shift_JIS" as an alias of "MS932".
+ * The JDK 1.2's behavior is invalid against IANA registrations.
*/
- final String javaName;
-
+ new SieveEncodingInfo("Shift_JIS", "SJIS", 0x7F, JIS_DANGER_CHARS),
/**
- * The last printable character.
+ * "MS932" is supported by JDK 1.2 or later.
*/
- final int lastPrintable;
-
- EncodingInfo( String name, String javaName, int lastPrintable )
- {
- this.name = name;
- this.javaName = javaName;
- this.lastPrintable = lastPrintable;
- }
-
- }
-
-
- /**
- * Constructs a list of all the supported encodings.
- */
- private static final EncodingInfo[] _encodings = new EncodingInfo[] {
- new EncodingInfo( "ASCII", "ASCII", 0x7F ),
- new EncodingInfo( "ISO-Latin-1", "ASCII", 0xFF ),
- new EncodingInfo( "ISO-8859-1", "ISO8859_1", 0xFF ),
- new EncodingInfo( "ISO-8859-2", "ISO8859_2", 0xFF ),
- new EncodingInfo( "ISO-8859-3", "ISO8859_3", 0xFF ),
- new EncodingInfo( "ISO-8859-4", "ISO8859_4", 0xFF ),
- new EncodingInfo( "ISO-8859-5", "ISO8859_5", 0xFF ),
- new EncodingInfo( "ISO-8859-6", "ISO8859_6", 0xFF ),
- new EncodingInfo( "ISO-8859-7", "ISO8859_7", 0xFF ),
- new EncodingInfo( "ISO-8859-8", "ISO8859_8", 0xFF ),
- new EncodingInfo( "ISO-8859-9", "ISO8859_9", 0xFF ),
- new EncodingInfo( "UTF-8", "UTF8", 0xFFFF ),
- new EncodingInfo( "UNICODE", "Unicode", 0xFFFF )
+ new SieveEncodingInfo("Windows-31J", "MS932", 0x7F, JIS_DANGER_CHARS),
+ new SieveEncodingInfo("EUC-JP", null, 0x7F, JIS_DANGER_CHARS),
+ new SieveEncodingInfo("ISO-2022-JP", null, 0x7F, JIS_DANGER_CHARS),
};
-
-
}
1.14 +2 -2 xml-xerces/java/src/org/apache/xml/serialize/HTMLSerializer.java
Index: HTMLSerializer.java
===================================================================
RCS file: /home/cvs/xml-xerces/java/src/org/apache/xml/serialize/HTMLSerializer.java,v
retrieving revision 1.13
retrieving revision 1.14
diff -u -r1.13 -r1.14
--- HTMLSerializer.java 2000/08/30 18:59:21 1.13
+++ HTMLSerializer.java 2000/12/14 19:21:51 1.14
@@ -116,7 +116,7 @@
* </ul>
*
*
- * @version $Revision: 1.13 $ $Date: 2000/08/30 18:59:21 $
+ * @version $Revision: 1.14 $ $Date: 2000/12/14 19:21:51 $
* @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
* @see Serializer
*/
@@ -811,7 +811,7 @@
}
- protected String getEntityRef( char ch )
+ protected String getEntityRef(int ch)
{
return HTMLdtd.fromChar( ch );
}
1.10 +5 -2 xml-xerces/java/src/org/apache/xml/serialize/HTMLdtd.java
Index: HTMLdtd.java
===================================================================
RCS file: /home/cvs/xml-xerces/java/src/org/apache/xml/serialize/HTMLdtd.java,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -r1.9 -r1.10
--- HTMLdtd.java 2000/08/30 18:59:21 1.9
+++ HTMLdtd.java 2000/12/14 19:21:52 1.10
@@ -81,7 +81,7 @@
* first time any of these methods is called for fast and efficient access.
*
*
- * @version $Revision: 1.9 $ $Date: 2000/08/30 18:59:21 $
+ * @version $Revision: 1.10 $ $Date: 2000/12/14 19:21:52 $
* @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
*/
public final class HTMLdtd
@@ -353,8 +353,11 @@
* @param value Character value of entity
* @return Entity's name or null
*/
- public static String fromChar( char value )
+ public static String fromChar(int value )
{
+ if (value > 0xffff)
+ return null;
+
String name;
initialize();
1.5 +4 -1 xml-xerces/java/src/org/apache/xml/serialize/Makefile
Index: Makefile
===================================================================
RCS file: /home/cvs/xml-xerces/java/src/org/apache/xml/serialize/Makefile,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- Makefile 2000/02/29 02:02:26 1.4
+++ Makefile 2000/12/14 19:21:53 1.5
@@ -14,7 +14,10 @@
ElementState.class\
HTMLdtd.class\
SerializerFactory.class\
- SerializerFactoryImpl.class
+ SerializerFactoryImpl.class \
+ EncodingInfo.class \
+ SieveEncodingInfo.class \
+ Encodings.class
DIRS =
1.10 +25 -2 xml-xerces/java/src/org/apache/xml/serialize/OutputFormat.java
Index: OutputFormat.java
===================================================================
RCS file: /home/cvs/xml-xerces/java/src/org/apache/xml/serialize/OutputFormat.java,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -r1.9 -r1.10
--- OutputFormat.java 2000/08/30 18:59:21 1.9
+++ OutputFormat.java 2000/12/14 19:21:54 1.10
@@ -91,7 +91,7 @@
* </ul>
*
*
- * @version $Revision: 1.9 $ $Date: 2000/08/30 18:59:21 $
+ * @version $Revision: 1.10 $ $Date: 2000/12/14 19:21:54 $
* @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
* <a href="mailto:visco@intalio.com">Keith Visco</a>
* @see Serializer
@@ -184,6 +184,10 @@
*/
private String _encoding = Defaults.Encoding;
+ /**
+ * The EncodingInfo instance for _encoding.
+ */
+ private EncodingInfo _encodingInfo = null;
/**
* The specified media type or null.
@@ -216,7 +220,6 @@
/**
-<<<<<<< OutputFormat.java
* Ture if comments should be ommited;
*/
private boolean _omitComments = false;
@@ -477,8 +480,28 @@
public void setEncoding( String encoding )
{
_encoding = encoding;
+ _encodingInfo = null;
+ }
+
+ /**
+ * Sets the encoding for this output method with an <code>EncodingInfo</code>
+ * instance.
+ */
+ public void setEncoding(EncodingInfo encInfo) {
+ _encoding = encInfo.getName();
+ _encodingInfo = encInfo;
}
+ /**
+ * Returns an <code>EncodingInfo<code> instance for the encoding.
+ *
+ * @see setEncoding
+ */
+ public EncodingInfo getEncodingInfo() {
+ if (_encodingInfo == null)
+ _encodingInfo = Encodings.getEncodingInfo(_encoding);
+ return _encodingInfo;
+ }
/**
* Returns the specified media type, or null.
1.8 +2 -2 xml-xerces/java/src/org/apache/xml/serialize/TextSerializer.java
Index: TextSerializer.java
===================================================================
RCS file: /home/cvs/xml-xerces/java/src/org/apache/xml/serialize/TextSerializer.java,v
retrieving revision 1.7
retrieving revision 1.8
diff -u -r1.7 -r1.8
--- TextSerializer.java 2000/08/30 18:59:22 1.7
+++ TextSerializer.java 2000/12/14 19:21:55 1.8
@@ -90,7 +90,7 @@
* org.xml.sax.DocumentHandler#endDocument}.
*
*
- * @version $Revision: 1.7 $ $Date: 2000/08/30 18:59:22 $
+ * @version $Revision: 1.8 $ $Date: 2000/12/14 19:21:55 $
* @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
* @see Serializer
*/
@@ -388,7 +388,7 @@
}
- protected String getEntityRef( char ch )
+ protected String getEntityRef( int ch )
{
return null;
}
1.17 +2 -2 xml-xerces/java/src/org/apache/xml/serialize/XMLSerializer.java
Index: XMLSerializer.java
===================================================================
RCS file: /home/cvs/xml-xerces/java/src/org/apache/xml/serialize/XMLSerializer.java,v
retrieving revision 1.16
retrieving revision 1.17
diff -u -r1.16 -r1.17
--- XMLSerializer.java 2000/08/30 18:59:22 1.16
+++ XMLSerializer.java 2000/12/14 19:21:56 1.17
@@ -104,7 +104,7 @@
* spaces at beginning of line will be stripped.
*
*
- * @version $Revision: 1.16 $ $Date: 2000/08/30 18:59:22 $
+ * @version $Revision: 1.17 $ $Date: 2000/12/14 19:21:56 $
* @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
* @see Serializer
*/
@@ -652,7 +652,7 @@
}
- protected String getEntityRef( char ch )
+ protected String getEntityRef(int ch)
{
// Encode special XML characters into the equivalent character references.
// These five are defined by default for all XML documents.
1.1 xml-xerces/java/src/org/apache/xml/serialize/EncodingInfo.java
Index: EncodingInfo.java
===================================================================
/*
* The Apache Software License, Version 1.1
*
*
* Copyright (c) 2000 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Xerces" and "Apache Software Foundation" must
* not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* nor may "Apache" appear in their name, without prior written
* permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation and was
* originally based on software copyright (c) 1999, International
* Business Machines, Inc., http://www.apache.org. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
package org.apache.xml.serialize;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
/**
* This class represents an encoding.
*
* @version $Id: EncodingInfo.java,v 1.1 2000/12/14 19:21:50 lehors Exp $
*/
public class EncodingInfo {

    /** MIME charset name exactly as passed to the constructor; may be null. */
    String name;

    /** Converter name handed to {@link OutputStreamWriter}; null when no name was supplied at all. */
    String javaName;

    /** Highest code point that {@link #isPrintable(int)} accepts. */
    int lastPrintable;

    /**
     * Builds an encoding description from a MIME name and a Java converter name.
     *
     * @param mimeName      MIME charset name, or null
     * @param javaName      Java converter name; when null, <code>mimeName</code> is used instead
     * @param lastPrintable highest code point considered printable
     */
    public EncodingInfo(String mimeName, String javaName, int lastPrintable) {
        this.name = mimeName;
        this.lastPrintable = lastPrintable;
        if (javaName != null) {
            this.javaName = javaName;
        } else {
            this.javaName = mimeName;
        }
    }

    /**
     * Builds an encoding description whose Java converter name equals its MIME name.
     *
     * @param mimeName      MIME charset name, or null
     * @param lastPrintable highest code point considered printable
     */
    public EncodingInfo(String mimeName, int lastPrintable) {
        this(mimeName, mimeName, lastPrintable);
    }

    /**
     * Returns the MIME charset name given at construction time.
     *
     * @return the MIME name, possibly null
     */
    public String getName() {
        return name;
    }

    /**
     * Wraps an output stream in a writer that encodes with this encoding.
     * When no converter name is known, the single-argument
     * <code>OutputStreamWriter</code> constructor is used.
     *
     * @param output the byte stream to wrap
     * @return a writer over <code>output</code>
     * @exception UnsupportedEncodingException there is no converter
     *            for this encoding's Java name
     */
    public Writer getWriter(OutputStream output)
        throws UnsupportedEncodingException {
        String converter = this.javaName;
        return converter != null
            ? new OutputStreamWriter(output, converter)
            : new OutputStreamWriter(output);
    }

    /**
     * Tells whether a code point may be written literally, i.e. whether it
     * does not exceed the last printable code point of this encoding.
     *
     * @param ch a code point (0-0x10ffff)
     * @return true when <code>ch</code> is at or below the threshold
     */
    public boolean isPrintable(int ch) {
        return !(ch > this.lastPrintable);
    }
}
1.1 xml-xerces/java/src/org/apache/xml/serialize/SieveEncodingInfo.java
Index: SieveEncodingInfo.java
===================================================================
/*
* The Apache Software License, Version 1.1
*
*
* Copyright (c) 2000 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Xerces" and "Apache Software Foundation" must
* not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* nor may "Apache" appear in their name, without prior written
* permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation and was
* originally based on software copyright (c) 1999, International
* Business Machines, Inc., http://www.apache.org. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
package org.apache.xml.serialize;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
/**
* This class represents an encoding.
*
* @version $Id: SieveEncodingInfo.java,v 1.1 2000/12/14 19:21:55 lehors Exp $
*/
public class SieveEncodingInfo extends EncodingInfo {

    /** Scratch byte sink behind {@link #checkerWriter}; created on first use. */
    BAOutputStream checkerStream = null;

    /** Writer used to trial-encode characters; created on first use. */
    Writer checkerWriter = null;

    /** Characters that are never reported printable, or null for none. */
    String dangerChars = null;

    /**
     * Creates new <code>SieveEncodingInfo</code> instance.
     *
     * @param mimeName      MIME charset name
     * @param javaName      Java converter name; when null, <code>mimeName</code> is used
     * @param lastPrintable code points up to this value are printable without a trial encoding
     * @param dangers A sorted characters that are always printed as character references.
     */
    public SieveEncodingInfo(String mimeName, String javaName,
                             int lastPrintable, String dangers) {
        super(mimeName, javaName, lastPrintable);
        this.dangerChars = dangers;
    }

    /**
     * Creates new <code>SieveEncodingInfo</code> instance without any
     * always-escaped characters.
     *
     * @param mimeName      MIME charset name, also used as the Java converter name
     * @param lastPrintable code points up to this value are printable without a trial encoding
     */
    public SieveEncodingInfo(String mimeName, int lastPrintable) {
        this(mimeName, mimeName, lastPrintable, null);
    }

    /**
     * Checks whether the specified character is printable or not.
     * <p>
     * A character listed in the danger set is never printable. A character at
     * or below <code>lastPrintable</code> is always printable. Any other
     * character is trial-encoded through a private writer, and is reported as
     * not printable when the converter produced nothing but the substitution
     * byte <code>'?'</code>. This assumes the converter substitutes
     * <code>'?'</code> for characters it cannot encode.
     * <p>
     * The trial encoding runs under this instance's monitor because the
     * checker writer and its buffer are shared between calls.
     *
     * @param ch a code point (0-0x10ffff)
     * @return true when the character may be written literally
     */
    public boolean isPrintable(int ch) {
        if (this.dangerChars != null && ch <= 0xffff) {
            /**
             * Searches this.dangerChars for ch.
             * TODO: Use binary search.
             */
            if (this.dangerChars.indexOf(ch) >= 0)
                return false;
        }

        if (ch <= this.lastPrintable)
            return true;

        boolean printable = true;
        synchronized (this) {
            try {
                if (this.checkerWriter == null) {
                    BAOutputStream sink = new BAOutputStream(10);
                    // A null converter name means "platform default", the same
                    // convention EncodingInfo.getWriter() follows.
                    this.checkerWriter = this.javaName == null
                        ? new OutputStreamWriter(sink)
                        : new OutputStreamWriter(sink, this.javaName);
                    this.checkerStream = sink;
                }
                // Start from an empty buffer whatever the previous call left behind.
                this.checkerStream.reset();

                // A supplementary character yields at most two substitution
                // bytes (one per surrogate), a BMP character at most one.
                int maxSubstitutes;
                if (ch > 0xffff) {
                    this.checkerWriter.write(((ch-0x10000)>>10)+0xd800);
                    this.checkerWriter.write(((ch-0x10000)&0x3ff)+0xdc00);
                    maxSubstitutes = 2;
                } else {
                    this.checkerWriter.write(ch);
                    maxSubstitutes = 1;
                }
                // The writer buffers encoded bytes internally; without a flush
                // the byte stream is still empty and the check below is void.
                this.checkerWriter.flush();

                if (isSubstitution(this.checkerStream.getBuffer(),
                                   this.checkerStream.size(), maxSubstitutes))
                    printable = false;
            } catch (IOException ioe) {
                printable = false;
                // The converter may be unavailable or left in an unknown
                // state; discard it so the next call starts afresh.
                this.checkerWriter = null;
                this.checkerStream = null;
            }
        }
        return printable;
    }

    /**
     * Tells whether the first <code>length</code> bytes are between one and
     * <code>maxLength</code> substitution bytes (<code>'?'</code>) and nothing else.
     */
    private static boolean isSubstitution(byte[] bytes, int length, int maxLength) {
        if (length < 1 || length > maxLength)
            return false;
        for (int i = 0; i < length; i++) {
            if (bytes[i] != '?')
                return false;
        }
        return true;
    }

    /**
     * Why don't we use the original ByteArrayOutputStream?
     * - Because the toByteArray() method of the ByteArrayOutputStream
     * creates new byte[] instances for each call.
     */
    static class BAOutputStream extends ByteArrayOutputStream {
        BAOutputStream() {
            super();
        }

        BAOutputStream(int size) {
            super(size);
        }

        /**
         * Returns the live internal buffer; only the first
         * <code>size()</code> bytes are meaningful.
         */
        byte[] getBuffer() {
            return this.buf;
        }
    }
}