You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ni...@apache.org on 2010/10/04 04:59:49 UTC
svn commit: r1004090 - in /commons/proper/io/trunk/src:
java/org/apache/commons/io/input/ java/org/apache/commons/io/output/
test/org/apache/commons/io/input/ test/org/apache/commons/io/output/
Author: niallp
Date: Mon Oct 4 02:59:49 2010
New Revision: 1004090
URL: http://svn.apache.org/viewvc?rev=1004090&view=rev
Log:
IO-162 add Xml(Stream)Reader/Writer from ROME - thanks to Hervé Boutemy for the patch
Added:
commons/proper/io/trunk/src/java/org/apache/commons/io/input/XmlStreamReader.java (with props)
commons/proper/io/trunk/src/java/org/apache/commons/io/input/XmlStreamReaderException.java (with props)
commons/proper/io/trunk/src/java/org/apache/commons/io/output/XmlStreamWriter.java (with props)
commons/proper/io/trunk/src/test/org/apache/commons/io/input/XmlStreamReaderTest.java (with props)
commons/proper/io/trunk/src/test/org/apache/commons/io/output/XmlStreamWriterTest.java (with props)
Added: commons/proper/io/trunk/src/java/org/apache/commons/io/input/XmlStreamReader.java
URL: http://svn.apache.org/viewvc/commons/proper/io/trunk/src/java/org/apache/commons/io/input/XmlStreamReader.java?rev=1004090&view=auto
==============================================================================
--- commons/proper/io/trunk/src/java/org/apache/commons/io/input/XmlStreamReader.java (added)
+++ commons/proper/io/trunk/src/java/org/apache/commons/io/input/XmlStreamReader.java Mon Oct 4 02:59:49 2010
@@ -0,0 +1,703 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.io.input;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.io.StringReader;
+import java.net.URL;
+import java.net.URLConnection;
+import java.net.HttpURLConnection;
+import java.util.regex.Pattern;
+import java.util.regex.Matcher;
+import java.text.MessageFormat;
+
+/**
+ * Character stream that handles all the necessary voodoo to figure out the
+ * charset encoding of the XML document within the stream.
+ * <p>
+ * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader.
+ * This one IS a character stream.
+ * <p>
+ * All this has to be done without consuming characters from the stream;
+ * otherwise the XML parser will not recognize the document as valid XML. This
+ * is not 100% true, but it's close enough (a UTF-8 BOM is not handled by all
+ * parsers right now; XmlStreamReader handles it and things work in all parsers).
+ * <p>
+ * The XmlStreamReader class handles the charset encoding of XML documents in
+ * Files, raw streams and HTTP streams by offering a wide set of constructors.
+ * <p>
+ * By default the charset encoding detection is lenient; the constructors with
+ * the lenient flag can be used for a strict detection (following the HTTP MIME
+ * and XML specifications). All this is nicely explained by Mark Pilgrim in his
+ * blog, <a href="http://diveintomark.org/archives/2004/02/13/xml-media-types">
+ * Determining the character encoding of a feed</a>.
+ * <p>
+ * Originally developed for <a href="http://rome.dev.java.net">ROME</a> under
+ * Apache License 2.0.
+ *
+ * @author Alejandro Abdelnur
+ * @version $Id$
+ * @see XmlStreamWriter
+ */
+public class XmlStreamReader extends Reader {
+ private static final int BUFFER_SIZE = 4096;
+
+ private static final String UTF_8 = "UTF-8";
+
+ private static final String US_ASCII = "US-ASCII";
+
+ private static final String UTF_16BE = "UTF-16BE";
+
+ private static final String UTF_16LE = "UTF-16LE";
+
+ private static final String UTF_16 = "UTF-16";
+
+ private static final String EBCDIC = "CP1047";
+
+ private static String staticDefaultEncoding = null;
+
+ private Reader reader;
+
+ private String encoding;
+
+ private String defaultEncoding;
+
+ /**
+ * Sets the default encoding to use if none is set in HTTP content-type, XML
+ * prolog and the rules based on content-type are not adequate.
+ * <p>
+ * If it is set to NULL the content-type based rules are used.
+ * <p>
+ * By default it is NULL.
+ *
+ * @param encoding charset encoding to default to.
+ */
+ public static void setDefaultEncoding(String encoding) {
+ // Stored in a static field shared by the whole class; the constructors copy
+ // the value that is current when they run into the instance.
+ staticDefaultEncoding = encoding;
+ }
+
+ /**
+ * Returns the default encoding to use if none is set in HTTP content-type,
+ * XML prolog and the rules based on content-type are not adequate.
+ * <p>
+ * If it is NULL the content-type based rules are used.
+ *
+ * @return the default encoding to use.
+ */
+ public static String getDefaultEncoding() {
+ // Returns the class-wide value, not the copy held by any particular reader.
+ return staticDefaultEncoding;
+ }
+
+ /**
+ * Creates a Reader for a File.
+ * <p>
+ * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset,
+ * if this is also missing defaults to UTF-8.
+ * <p>
+ * It does a lenient charset encoding detection, check the constructor with
+ * the lenient parameter for details.
+ *
+ * @param file File to create a Reader from.
+ * @throws IOException thrown if there is a problem reading the file.
+ */
+ public XmlStreamReader(File file) throws IOException {
+ // Delegates to the raw InputStream constructor.
+ // NOTE(review): if encoding detection throws, the FileInputStream opened
+ // here does not appear to be closed by anyone - confirm and consider
+ // closing it on failure.
+ this(new FileInputStream(file));
+ }
+
+ /**
+ * Creates a Reader for a raw InputStream.
+ * <p>
+ * It follows the same logic used for files.
+ * <p>
+ * It does a lenient charset encoding detection, check the constructor with
+ * the lenient parameter for details.
+ *
+ * @param is InputStream to create a Reader from.
+ * @throws IOException thrown if there is a problem reading the stream.
+ */
+ public XmlStreamReader(InputStream is) throws IOException {
+ // lenient = true: detection failures fall back to a best-effort encoding.
+ this(is, true);
+ }
+
+ /**
+ * Creates a Reader for a raw InputStream.
+ * <p>
+ * It follows the same logic used for files.
+ * <p>
+ * If lenient detection is indicated and the detection above fails as per
+ * specifications it then attempts the following:
+ * <p>
+ * If the content type was 'text/html' it replaces it with 'text/xml' and
+ * tries the detection again.
+ * <p>
+ * Else if the XML prolog had a charset encoding that encoding is used.
+ * <p>
+ * Else if the content type had a charset encoding that encoding is used.
+ * <p>
+ * Else 'UTF-8' is used.
+ * <p>
+ * If lenient detection is indicated an XmlStreamReaderException is never
+ * thrown.
+ *
+ * @param is InputStream to create a Reader from.
+ * @param lenient indicates if the charset encoding detection should be
+ * relaxed.
+ * @throws IOException thrown if there is a problem reading the stream.
+ * @throws XmlStreamReaderException thrown if the charset encoding could not
+ * be determined according to the specs.
+ */
+ public XmlStreamReader(InputStream is, boolean lenient) throws IOException,
+         XmlStreamReaderException {
+     // Copy the class-wide default that is current at construction time.
+     defaultEncoding = staticDefaultEncoding;
+     try {
+         doRawStream(is, lenient);
+     } catch (XmlStreamReaderException detectionFailure) {
+         if (lenient) {
+             // A raw stream has no content-type to help the fallback.
+             doLenientDetection(null, detectionFailure);
+         } else {
+             throw detectionFailure;
+         }
+     }
+ }
+
+ /**
+ * Creates a Reader using the InputStream of a URL.
+ * <p>
+ * If the URL is not of type HTTP and there is not 'content-type' header in
+ * the fetched data it uses the same logic used for Files.
+ * <p>
+ * If the URL is a HTTP Url or there is a 'content-type' header in the
+ * fetched data it uses the same logic used for an InputStream with
+ * content-type.
+ * <p>
+ * It does a lenient charset encoding detection, check the constructor with
+ * the lenient parameter for details.
+ *
+ * @param url URL to create a Reader from.
+ * @throws IOException thrown if there is a problem reading the stream of
+ * the URL.
+ */
+ public XmlStreamReader(URL url) throws IOException {
+ // Opens a connection and lets the URLConnection constructor decide
+ // between content-type based and raw detection.
+ this(url.openConnection());
+ }
+
+ /**
+ * Creates a Reader using the InputStream of a URLConnection.
+ * <p>
+ * If the URLConnection is not of type HttpURLConnection and there is not
+ * 'content-type' header in the fetched data it uses the same logic used for
+ * files.
+ * <p>
+ * If the URLConnection is a HTTP Url or there is a 'content-type' header in
+ * the fetched data it uses the same logic used for an InputStream with
+ * content-type.
+ * <p>
+ * It does a lenient charset encoding detection, check the constructor with
+ * the lenient parameter for details.
+ *
+ * @param conn URLConnection to create a Reader from.
+ * @throws IOException thrown if there is a problem reading the stream of
+ * the URLConnection.
+ */
+ public XmlStreamReader(URLConnection conn) throws IOException {
+ defaultEncoding = staticDefaultEncoding;
+ boolean lenient = true;
+ if (conn instanceof HttpURLConnection) {
+ try {
+ doHttpStream(conn.getInputStream(), conn.getContentType(),
+ lenient);
+ } catch (XmlStreamReaderException ex) {
+ doLenientDetection(conn.getContentType(), ex);
+ }
+ } else if (conn.getContentType() != null) {
+ try {
+ doHttpStream(conn.getInputStream(), conn.getContentType(),
+ lenient);
+ } catch (XmlStreamReaderException ex) {
+ doLenientDetection(conn.getContentType(), ex);
+ }
+ } else {
+ try {
+ doRawStream(conn.getInputStream(), lenient);
+ } catch (XmlStreamReaderException ex) {
+ doLenientDetection(null, ex);
+ }
+ }
+ }
+
+ /**
+ * Creates a Reader using an InputStream and the associated content-type
+ * header.
+ * <p>
+ * First it checks if the stream has BOM. If there is not BOM checks the
+ * content-type encoding. If there is not content-type encoding checks the
+ * XML prolog encoding. If there is not XML prolog encoding uses the default
+ * encoding mandated by the content-type MIME type.
+ * <p>
+ * It does a lenient charset encoding detection, check the constructor with
+ * the lenient parameter for details.
+ *
+ * @param is InputStream to create the reader from.
+ * @param httpContentType content-type header to use for the resolution of
+ * the charset encoding.
+ * @throws IOException thrown if there is a problem reading the file.
+ */
+ public XmlStreamReader(InputStream is, String httpContentType)
+ throws IOException {
+ // lenient = true: detection failures fall back to a best-effort encoding.
+ this(is, httpContentType, true);
+ }
+
+ /**
+ * Creates a Reader using an InputStream and the associated content-type
+ * header. Whether the encoding detection is lenient is controlled by the
+ * lenient parameter.
+ * <p>
+ * First it checks if the stream has BOM. If there is not BOM checks the
+ * content-type encoding. If there is not content-type encoding checks the
+ * XML prolog encoding. If there is not XML prolog encoding uses the default
+ * encoding mandated by the content-type MIME type.
+ * <p>
+ * If lenient detection is indicated and the detection above fails as per
+ * specifications it then attempts the following:
+ * <p>
+ * If the content type was 'text/html' it replaces it with 'text/xml' and
+ * tries the detection again.
+ * <p>
+ * Else if the XML prolog had a charset encoding that encoding is used.
+ * <p>
+ * Else if the content type had a charset encoding that encoding is used.
+ * <p>
+ * Else 'UTF-8' is used.
+ * <p>
+ * If lenient detection is indicated an XmlStreamReaderException is never
+ * thrown.
+ *
+ * @param is InputStream to create the reader from.
+ * @param httpContentType content-type header to use for the resolution of
+ * the charset encoding.
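+ * @param lenient indicates if the charset encoding detection should be
+ * relaxed.
+ * @param defaultEncoding encoding to fall back to when none can be
+ * detected; if null, the value of getDefaultEncoding() is used.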
+ * @throws IOException thrown if there is a problem reading the file.
+ * @throws XmlStreamReaderException thrown if the charset encoding could not
+ * be determined according to the specs.
+ */
+ public XmlStreamReader(InputStream is, String httpContentType,
+         boolean lenient, String defaultEncoding) throws IOException,
+         XmlStreamReaderException {
+     // An explicit default wins; otherwise use the class-wide default.
+     if (defaultEncoding != null) {
+         this.defaultEncoding = defaultEncoding;
+     } else {
+         this.defaultEncoding = staticDefaultEncoding;
+     }
+     try {
+         doHttpStream(is, httpContentType, lenient);
+     } catch (XmlStreamReaderException detectionFailure) {
+         if (lenient) {
+             doLenientDetection(httpContentType, detectionFailure);
+         } else {
+             throw detectionFailure;
+         }
+     }
+ }
+
+ /**
+ * Creates a Reader using an InputStream and the associated content-type
+ * header. Whether the encoding detection is lenient is controlled by the
+ * lenient parameter.
+ * <p>
+ * First it checks if the stream has BOM. If there is not BOM checks the
+ * content-type encoding. If there is not content-type encoding checks the
+ * XML prolog encoding. If there is not XML prolog encoding uses the default
+ * encoding mandated by the content-type MIME type.
+ * <p>
+ * If lenient detection is indicated and the detection above fails as per
+ * specifications it then attempts the following:
+ * <p>
+ * If the content type was 'text/html' it replaces it with 'text/xml' and
+ * tries the detection again.
+ * <p>
+ * Else if the XML prolog had a charset encoding that encoding is used.
+ * <p>
+ * Else if the content type had a charset encoding that encoding is used.
+ * <p>
+ * Else 'UTF-8' is used.
+ * <p>
+ * If lenient detection is indicated an XmlStreamReaderException is never
+ * thrown.
+ *
+ * @param is InputStream to create the reader from.
+ * @param httpContentType content-type header to use for the resolution of
+ * the charset encoding.
+ * @param lenient indicates if the charset encoding detection should be
+ * relaxed.
+ * @throws IOException thrown if there is a problem reading the file.
+ * @throws XmlStreamReaderException thrown if the charset encoding could not
+ * be determined according to the specs.
+ */
+ public XmlStreamReader(InputStream is, String httpContentType,
+ boolean lenient) throws IOException, XmlStreamReaderException {
+ // A null defaultEncoding makes the delegate use the static default.
+ this(is, httpContentType, lenient, null);
+ }
+
+ /**
+  * Best-effort fallback used after the regular detection has failed.
+  * <p>
+  * A 'text/html' content-type is first retried as 'text/xml'. If that does
+  * not apply or fails again, the encoding is taken from the failure itself:
+  * XML prolog encoding, then content-type encoding, then the default
+  * encoding (UTF-8 when no default is set).
+  *
+  * @param httpContentType content-type header, may be null.
+  * @param ex the detection failure carrying the stream and what was found.
+  * @throws IOException thrown if there is a problem reading the stream.
+  */
+ private void doLenientDetection(String httpContentType,
+         XmlStreamReaderException ex) throws IOException {
+     if (httpContentType != null && httpContentType.startsWith("text/html")) {
+         // Swap the MIME type, keeping whatever parameters followed it.
+         String parameters = httpContentType.substring("text/html".length());
+         httpContentType = "text/xml" + parameters;
+         try {
+             doHttpStream(ex.getInputStream(), httpContentType, true);
+             // The retry succeeded and the reader is ready.
+             return;
+         } catch (XmlStreamReaderException retryFailure) {
+             // Continue with the information gathered by the retry.
+             ex = retryFailure;
+         }
+     }
+     String fallback = ex.getXmlEncoding();
+     if (fallback == null) {
+         fallback = ex.getContentTypeEncoding();
+     }
+     if (fallback == null) {
+         fallback = (defaultEncoding == null) ? UTF_8 : defaultEncoding;
+     }
+     prepareReader(ex.getInputStream(), fallback);
+ }
+
+ /**
+ * Returns the charset encoding of the XmlStreamReader.
+ *
+ * @return charset encoding.
+ */
+ public String getEncoding() {
+ // Recorded by prepareReader() when the underlying reader is created.
+ return encoding;
+ }
+
+ /**
+ * Reads characters into a portion of an array, delegating to the reader
+ * that was created with the detected charset encoding.
+ *
+ * @param buf destination buffer.
+ * @param offset offset at which to start storing characters.
+ * @param len maximum number of characters to read.
+ * @return whatever the underlying reader's read returns.
+ * @throws IOException thrown if the underlying reader fails.
+ */
+ public int read(char[] buf, int offset, int len) throws IOException {
+ return reader.read(buf, offset, len);
+ }
+
+ /**
+ * Closes the XmlStreamReader stream.
+ *
+ * @throws IOException thrown if there was a problem closing the stream.
+ */
+ public void close() throws IOException {
+ // Closes the reader built in prepareReader() around the detected stream.
+ reader.close();
+ }
+
+ /**
+  * Detects the encoding of a stream that has no content-type information
+  * (BOM, first-bytes guess, XML prolog) and prepares the reader.
+  *
+  * @param is stream to read from.
+  * @param lenient not used by this method.
+  * @throws IOException thrown if there is a problem reading the stream or
+  *         the encoding cannot be determined.
+  */
+ private void doRawStream(InputStream is, boolean lenient)
+         throws IOException {
+     BufferedInputStream buffered = new BufferedInputStream(is, BUFFER_SIZE);
+     // Order matters: a BOM is consumed before the next two steps peek.
+     String bom = getBOMEncoding(buffered);
+     String guess = getXMLGuessEncoding(buffered);
+     String declared = getXmlProlog(buffered, guess);
+     prepareReader(buffered,
+             calculateRawEncoding(bom, guess, declared, buffered));
+ }
+
+ /**
+  * Detects the encoding of a stream that comes with a content-type header
+  * (content-type, BOM, first-bytes guess, XML prolog) and prepares the
+  * reader.
+  *
+  * @param is stream to read from.
+  * @param httpContentType content-type header, may be null.
+  * @param lenient passed on to the HTTP encoding calculation.
+  * @throws IOException thrown if there is a problem reading the stream or
+  *         the encoding cannot be determined.
+  */
+ private void doHttpStream(InputStream is, String httpContentType,
+         boolean lenient) throws IOException {
+     BufferedInputStream buffered = new BufferedInputStream(is, BUFFER_SIZE);
+     String mime = getContentTypeMime(httpContentType);
+     String headerEnc = getContentTypeEncoding(httpContentType);
+     // Order matters: a BOM is consumed before the next two steps peek.
+     String bom = getBOMEncoding(buffered);
+     String guess = getXMLGuessEncoding(buffered);
+     String declared = getXmlProlog(buffered, guess);
+     prepareReader(buffered, calculateHttpEncoding(mime, headerEnc, bom,
+             guess, declared, buffered, lenient));
+ }
+
+ // Wraps the stream in an InputStreamReader decoding with the given charset
+ // and records that charset name so getEncoding() can report it.
+ private void prepareReader(InputStream is, String encoding)
+ throws IOException {
+ // The reader is created first: if the charset name is rejected, the
+ // constructor throws before this.encoding is touched.
+ reader = new InputStreamReader(is, encoding);
+ this.encoding = encoding;
+ }
+
+ /**
+  * Decides the encoding of a raw stream from the BOM, the first-bytes guess
+  * and the encoding declared in the XML prolog.
+  * <p>
+  * Every inconsistency is reported as an XmlStreamReaderException so that
+  * lenient callers can fall back instead of failing.
+  *
+  * @param bomEnc encoding indicated by the BOM, null if there was no BOM.
+  * @param xmlGuessEnc encoding guessed from the first bytes, may be null.
+  * @param xmlEnc encoding declared in the XML prolog, may be null.
+  * @param is stream, passed for XmlStreamReaderException creation only.
+  * @return the encoding to use.
+  * @throws IOException (an XmlStreamReaderException) if the pieces of
+  *         information contradict each other or the BOM is unknown.
+  */
+ private String calculateRawEncoding(String bomEnc, String xmlGuessEnc,
+         String xmlEnc, InputStream is) throws IOException {
+     String encoding;
+     if (bomEnc == null) {
+         if (xmlGuessEnc == null || xmlEnc == null) {
+             encoding = (defaultEncoding == null) ? UTF_8 : defaultEncoding;
+         } else if (xmlEnc.equals(UTF_16)
+                 && (xmlGuessEnc.equals(UTF_16BE)
+                         || xmlGuessEnc.equals(UTF_16LE))) {
+             // The prolog only says "UTF-16"; the guess supplies the byte order.
+             encoding = xmlGuessEnc;
+         } else {
+             encoding = xmlEnc;
+         }
+     } else if (bomEnc.equals(UTF_8)) {
+         if ((xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8))
+                 || (xmlEnc != null && !xmlEnc.equals(UTF_8))) {
+             throw new XmlStreamReaderException(RAW_EX_1
+                     .format(new Object[] { bomEnc, xmlGuessEnc, xmlEnc }),
+                     bomEnc, xmlGuessEnc, xmlEnc, is);
+         }
+         encoding = UTF_8;
+     } else if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
+         // A guess mismatch used to raise a plain IOException here, which
+         // bypassed the lenient fallback; it is now reported like the others.
+         if ((xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc))
+                 || (xmlEnc != null && !xmlEnc.equals(UTF_16)
+                         && !xmlEnc.equals(bomEnc))) {
+             throw new XmlStreamReaderException(RAW_EX_1
+                     .format(new Object[] { bomEnc, xmlGuessEnc, xmlEnc }),
+                     bomEnc, xmlGuessEnc, xmlEnc, is);
+         }
+         encoding = bomEnc;
+     } else {
+         throw new XmlStreamReaderException(RAW_EX_2.format(new Object[] {
+                 bomEnc, xmlGuessEnc, xmlEnc }), bomEnc, xmlGuessEnc,
+                 xmlEnc, is);
+     }
+     return encoding;
+ }
+
+ /**
+  * Decides the encoding of a stream that came with a content-type header.
+  *
+  * @param cTMime MIME type from the content-type, may be null.
+  * @param cTEnc charset from the content-type, may be null.
+  * @param bomEnc encoding indicated by the BOM, null if there was no BOM.
+  * @param xmlGuessEnc encoding guessed from the first bytes, may be null.
+  * @param xmlEnc encoding declared in the XML prolog, may be null.
+  * @param is stream, passed for XmlStreamReaderException creation only.
+  * @param lenient if true, an encoding declared in the XML prolog wins
+  *        over everything else.
+  * @return the encoding to use.
+  * @throws IOException (an XmlStreamReaderException) if the MIME type is
+  *         not an XML one or the pieces of information contradict each other.
+  */
+ private String calculateHttpEncoding(String cTMime, String cTEnc,
+         String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is,
+         boolean lenient) throws IOException {
+     if (lenient && xmlEnc != null) {
+         return xmlEnc;
+     }
+     boolean appXml = isAppXml(cTMime);
+     boolean textXml = isTextXml(cTMime);
+     if (!appXml && !textXml) {
+         throw new XmlStreamReaderException(HTTP_EX_3
+                 .format(new Object[] { cTMime, cTEnc, bomEnc,
+                         xmlGuessEnc, xmlEnc }), cTMime, cTEnc, bomEnc,
+                 xmlGuessEnc, xmlEnc, is);
+     }
+     if (cTEnc == null) {
+         if (appXml) {
+             // application/*: same rules as for a raw stream.
+             return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, is);
+         }
+         // text/*: no charset parameter means the default (US-ASCII).
+         return (defaultEncoding == null) ? US_ASCII : defaultEncoding;
+     }
+     if (bomEnc != null
+             && (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE))) {
+         // An explicit byte order in the header must not come with a BOM.
+         throw new XmlStreamReaderException(HTTP_EX_1
+                 .format(new Object[] { cTMime, cTEnc, bomEnc,
+                         xmlGuessEnc, xmlEnc }), cTMime, cTEnc,
+                 bomEnc, xmlGuessEnc, xmlEnc, is);
+     }
+     if (cTEnc.equals(UTF_16)) {
+         if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
+             // The BOM supplies the byte order.
+             return bomEnc;
+         }
+         throw new XmlStreamReaderException(HTTP_EX_2
+                 .format(new Object[] { cTMime, cTEnc, bomEnc,
+                         xmlGuessEnc, xmlEnc }), cTMime, cTEnc,
+                 bomEnc, xmlGuessEnc, xmlEnc, is);
+     }
+     return cTEnc;
+ }
+
+ // Returns the MIME type part of a content-type header (everything before
+ // the first ';', trimmed), or NULL if httpContentType is NULL.
+ private static String getContentTypeMime(String httpContentType) {
+     if (httpContentType == null) {
+         return null;
+     }
+     int semicolon = httpContentType.indexOf(";");
+     String mimePart = (semicolon == -1) ? httpContentType
+             : httpContentType.substring(0, semicolon);
+     return mimePart.trim();
+ }
+
+ private static final Pattern CHARSET_PATTERN = Pattern
+ .compile("charset=[\"']?([.[^; \"']]*)[\"']?");
+
+ // Returns the charset parameter value of a content-type header in upper
+ // case, NULL if the parameter is not present or httpContentType is NULL.
+ private static String getContentTypeEncoding(String httpContentType) {
+     if (httpContentType == null) {
+         return null;
+     }
+     int semicolon = httpContentType.indexOf(";");
+     if (semicolon == -1) {
+         // No parameters after the MIME type.
+         return null;
+     }
+     Matcher m = CHARSET_PATTERN.matcher(httpContentType
+             .substring(semicolon + 1));
+     if (!m.find()) {
+         return null;
+     }
+     String encoding = m.group(1);
+     // Charset names are ASCII: upper-case them independently of the default
+     // locale (a Turkish default locale would turn 'i' into a dotted 'İ').
+     return (encoding != null)
+             ? encoding.toUpperCase(java.util.Locale.ENGLISH) : null;
+ }
+
+ // returns the BOM in the stream, NULL if not present,
+ // if there was BOM the in the stream it is consumed
+ private static String getBOMEncoding(BufferedInputStream is)
+ throws IOException {
+ String encoding = null;
+ int[] bytes = new int[3];
+ is.mark(3);
+ bytes[0] = is.read();
+ bytes[1] = is.read();
+ bytes[2] = is.read();
+
+ if (bytes[0] == 0xFE && bytes[1] == 0xFF) {
+ encoding = UTF_16BE;
+ is.reset();
+ is.read();
+ is.read();
+ } else if (bytes[0] == 0xFF && bytes[1] == 0xFE) {
+ encoding = UTF_16LE;
+ is.reset();
+ is.read();
+ is.read();
+ } else if (bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
+ encoding = UTF_8;
+ } else {
+ is.reset();
+ }
+ return encoding;
+ }
+
+ // returns the best guess for the encoding by looking the first bytes of the
+ // stream, '<?'
+ private static String getXMLGuessEncoding(BufferedInputStream is)
+ throws IOException {
+ String encoding = null;
+ int[] bytes = new int[4];
+ is.mark(4);
+ bytes[0] = is.read();
+ bytes[1] = is.read();
+ bytes[2] = is.read();
+ bytes[3] = is.read();
+ is.reset();
+
+ if (bytes[0] == 0x00 && bytes[1] == 0x3C && bytes[2] == 0x00
+ && bytes[3] == 0x3F) {
+ encoding = UTF_16BE;
+ } else if (bytes[0] == 0x3C && bytes[1] == 0x00 && bytes[2] == 0x3F
+ && bytes[3] == 0x00) {
+ encoding = UTF_16LE;
+ } else if (bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78
+ && bytes[3] == 0x6D) {
+ encoding = UTF_8;
+ } else if (bytes[0] == 0x4C && bytes[1] == 0x6F && bytes[2] == 0xA7
+ && bytes[3] == 0x94) {
+ encoding = EBCDIC;
+ }
+ return encoding;
+ }
+
+ public static final Pattern ENCODING_PATTERN = Pattern.compile(
+ "<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))",
+ Pattern.MULTILINE);
+
+ // Returns the encoding declared in the <?xml encoding=...?> prolog in upper
+ // case, NULL if none is declared or if guessedEnc is NULL. The stream is
+ // rewound to where it was on entry before returning normally.
+ private static String getXmlProlog(BufferedInputStream is, String guessedEnc)
+         throws IOException {
+     if (guessedEnc == null) {
+         // Without a guess the bytes cannot be decoded to look for a prolog.
+         return null;
+     }
+     byte[] bytes = new byte[BUFFER_SIZE];
+     is.mark(BUFFER_SIZE);
+     int offset = 0;
+     int max = BUFFER_SIZE;
+     int c = is.read(bytes, offset, max);
+     int firstGT = -1;
+     String xmlProlog = null;
+     while (c != -1 && firstGT == -1 && offset < BUFFER_SIZE) {
+         offset += c;
+         max -= c;
+         xmlProlog = new String(bytes, 0, offset, guessedEnc);
+         firstGT = xmlProlog.indexOf('>');
+         if (firstGT == -1) {
+             // Only ask for more data while the end of the first tag is
+             // still missing, so a complete prolog never waits on the
+             // stream for bytes that are not needed.
+             c = is.read(bytes, offset, max);
+         }
+     }
+     if (firstGT == -1) {
+         if (c == -1) {
+             throw new IOException("Unexpected end of XML stream");
+         } else {
+             throw new IOException(
+                     "XML prolog or ROOT element not found on first "
+                             + offset + " bytes");
+         }
+     }
+     // '>' was found, so at least one byte was read: rewind for the caller.
+     is.reset();
+
+     // Join the lines of the first tag so the pattern sees a single line.
+     BufferedReader bReader = new BufferedReader(new StringReader(
+             xmlProlog.substring(0, firstGT + 1)));
+     StringBuilder prolog = new StringBuilder();
+     String line = bReader.readLine();
+     while (line != null) {
+         prolog.append(line);
+         line = bReader.readLine();
+     }
+
+     String encoding = null;
+     Matcher m = ENCODING_PATTERN.matcher(prolog);
+     if (m.find()) {
+         // Charset names are ASCII: upper-case them independently of the
+         // default locale, then strip the surrounding quotes.
+         String quoted = m.group(1).toUpperCase(java.util.Locale.ENGLISH);
+         encoding = quoted.substring(1, quoted.length() - 1);
+     }
+     return encoding;
+ }
+
+ // Indicates if the MIME type belongs to the APPLICATION XML family:
+ // one of the fixed application/xml* types or any application/*+xml type.
+ private static boolean isAppXml(String mime) {
+     if (mime == null) {
+         return false;
+     }
+     if (mime.equals("application/xml")
+             || mime.equals("application/xml-dtd")
+             || mime.equals("application/xml-external-parsed-entity")) {
+         return true;
+     }
+     return mime.startsWith("application/") && mime.endsWith("+xml");
+ }
+
+ // Indicates if the MIME type belongs to the TEXT XML family:
+ // one of the fixed text/xml* types or any text/*+xml type.
+ private static boolean isTextXml(String mime) {
+     if (mime == null) {
+         return false;
+     }
+     if (mime.equals("text/xml")
+             || mime.equals("text/xml-external-parsed-entity")) {
+         return true;
+     }
+     return mime.startsWith("text/") && mime.endsWith("+xml");
+ }
+
+ private static final MessageFormat RAW_EX_1 = new MessageFormat(
+ "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch");
+
+ private static final MessageFormat RAW_EX_2 = new MessageFormat(
+ "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM");
+
+ private static final MessageFormat HTTP_EX_1 = new MessageFormat(
+ "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL");
+
+ private static final MessageFormat HTTP_EX_2 = new MessageFormat(
+ "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch");
+
+ private static final MessageFormat HTTP_EX_3 = new MessageFormat(
+ "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME");
+
+}
Propchange: commons/proper/io/trunk/src/java/org/apache/commons/io/input/XmlStreamReader.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: commons/proper/io/trunk/src/java/org/apache/commons/io/input/XmlStreamReader.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Added: commons/proper/io/trunk/src/java/org/apache/commons/io/input/XmlStreamReaderException.java
URL: http://svn.apache.org/viewvc/commons/proper/io/trunk/src/java/org/apache/commons/io/input/XmlStreamReaderException.java?rev=1004090&view=auto
==============================================================================
--- commons/proper/io/trunk/src/java/org/apache/commons/io/input/XmlStreamReaderException.java (added)
+++ commons/proper/io/trunk/src/java/org/apache/commons/io/input/XmlStreamReaderException.java Mon Oct 4 02:59:49 2010
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.io.input;
+
+import java.io.InputStream;
+import java.io.IOException;
+
+/**
+ * The XmlStreamReaderException is thrown by the XmlStreamReader constructors if
+ * the charset encoding can not be determined according to the XML 1.0
+ * specification and RFC 3023.
+ * <p>
+ * The exception returns the unconsumed InputStream to allow the application to
+ * do an alternate processing with the stream. Note that the original
+ * InputStream given to the XmlStreamReader cannot be used as that one has been
+ * already read.
+ *
+ * @author Alejandro Abdelnur
+ * @version $Id$
+ */
+public class XmlStreamReaderException extends IOException {
+
+    /** Serialization version; exceptions are Serializable through Throwable. */
+    private static final long serialVersionUID = 1L;
+
+    private final String bomEncoding;
+
+    private final String xmlGuessEncoding;
+
+    private final String xmlEncoding;
+
+    private final String contentTypeMime;
+
+    private final String contentTypeEncoding;
+
+    /**
+     * The stream detection was attempted on. Streams are not serializable, so
+     * the field is transient and is null on a deserialized instance.
+     */
+    private final transient InputStream is;
+
+    /**
+     * Creates an exception instance if the charset encoding could not be
+     * determined.
+     * <p>
+     * Instances of this exception are thrown by the XmlStreamReader. The
+     * content-type MIME and encoding are recorded as null.
+     *
+     * @param msg message describing the reason for the exception.
+     * @param bomEnc BOM encoding.
+     * @param xmlGuessEnc XML guess encoding.
+     * @param xmlEnc XML prolog encoding.
+     * @param is the unconsumed InputStream.
+     */
+    public XmlStreamReaderException(String msg, String bomEnc,
+            String xmlGuessEnc, String xmlEnc, InputStream is) {
+        this(msg, null, null, bomEnc, xmlGuessEnc, xmlEnc, is);
+    }
+
+    /**
+     * Creates an exception instance if the charset encoding could not be
+     * determined.
+     * <p>
+     * Instances of this exception are thrown by the XmlStreamReader.
+     *
+     * @param msg message describing the reason for the exception.
+     * @param ctMime MIME type in the content-type.
+     * @param ctEnc encoding in the content-type.
+     * @param bomEnc BOM encoding.
+     * @param xmlGuessEnc XML guess encoding.
+     * @param xmlEnc XML prolog encoding.
+     * @param is the unconsumed InputStream.
+     */
+    public XmlStreamReaderException(String msg, String ctMime, String ctEnc,
+            String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is) {
+        super(msg);
+        contentTypeMime = ctMime;
+        contentTypeEncoding = ctEnc;
+        bomEncoding = bomEnc;
+        xmlGuessEncoding = xmlGuessEnc;
+        xmlEncoding = xmlEnc;
+        this.is = is;
+    }
+
+    /**
+     * Returns the BOM encoding found in the InputStream.
+     *
+     * @return the BOM encoding, null if none.
+     */
+    public String getBomEncoding() {
+        return bomEncoding;
+    }
+
+    /**
+     * Returns the encoding guess based on the first bytes of the InputStream.
+     *
+     * @return the encoding guess, null if it couldn't be guessed.
+     */
+    public String getXmlGuessEncoding() {
+        return xmlGuessEncoding;
+    }
+
+    /**
+     * Returns the encoding found in the XML prolog of the InputStream.
+     *
+     * @return the encoding of the XML prolog, null if none.
+     */
+    public String getXmlEncoding() {
+        return xmlEncoding;
+    }
+
+    /**
+     * Returns the MIME type in the content-type used to attempt determining the
+     * encoding.
+     *
+     * @return the MIME type in the content-type, null if there was no
+     *         content-type or the encoding detection did not involve HTTP.
+     */
+    public String getContentTypeMime() {
+        return contentTypeMime;
+    }
+
+    /**
+     * Returns the encoding in the content-type used to attempt determining the
+     * encoding.
+     *
+     * @return the encoding in the content-type, null if there was no
+     *         content-type, no encoding in it or the encoding detection did not
+     *         involve HTTP.
+     */
+    public String getContentTypeEncoding() {
+        return contentTypeEncoding;
+    }
+
+    /**
+     * Returns the unconsumed InputStream to allow the application to do an
+     * alternate encoding detection on the InputStream.
+     *
+     * @return the unconsumed InputStream; null if this exception instance was
+     *         obtained through deserialization.
+     */
+    public InputStream getInputStream() {
+        return is;
+    }
+}
Propchange: commons/proper/io/trunk/src/java/org/apache/commons/io/input/XmlStreamReaderException.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: commons/proper/io/trunk/src/java/org/apache/commons/io/input/XmlStreamReaderException.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Added: commons/proper/io/trunk/src/java/org/apache/commons/io/output/XmlStreamWriter.java
URL: http://svn.apache.org/viewvc/commons/proper/io/trunk/src/java/org/apache/commons/io/output/XmlStreamWriter.java?rev=1004090&view=auto
==============================================================================
--- commons/proper/io/trunk/src/java/org/apache/commons/io/output/XmlStreamWriter.java (added)
+++ commons/proper/io/trunk/src/java/org/apache/commons/io/output/XmlStreamWriter.java Mon Oct 4 02:59:49 2010
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.io.output;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.StringWriter;
+import java.io.Writer;
+import java.util.Locale;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.input.XmlStreamReader;
+
+/**
+ * Character stream that handles all the necessary Voodoo to figure out the
+ * charset encoding of the XML document written to the stream.
+ * <p>
+ * The first characters written (at most {@code BUFFER_SIZE}) are buffered
+ * until the XML prolog can be inspected. The encoding declared there, or
+ * UTF-8 when none is found, is then used to encode everything written to
+ * the underlying {@link OutputStream}.
+ *
+ * @author Herve Boutemy
+ * @version $Id$
+ * @see XmlStreamReader
+ */
+public class XmlStreamWriter extends Writer {
+    /** Maximum number of leading characters buffered while looking for the prolog. */
+    private static final int BUFFER_SIZE = 4096;
+
+    /** Pattern used to locate the encoding declaration inside the XML prolog. */
+    static final Pattern ENCODING_PATTERN = XmlStreamReader.ENCODING_PATTERN;
+
+    /** Target of the encoded bytes. */
+    private final OutputStream out;
+
+    /** Buffer for the leading characters; set to null once the encoding is chosen. */
+    private StringWriter xmlPrologWriter = new StringWriter(BUFFER_SIZE);
+
+    /** Encoding writer; null until the encoding has been chosen. */
+    private Writer writer;
+
+    /** Chosen encoding; null until it has been chosen. */
+    private String encoding;
+
+    /**
+     * Creates a writer that encodes to the given stream.
+     *
+     * @param out the stream receiving the encoded document
+     */
+    public XmlStreamWriter(OutputStream out) {
+        this.out = out;
+    }
+
+    /**
+     * Creates a writer that encodes to the given file.
+     *
+     * @param file the file receiving the encoded document
+     * @throws FileNotFoundException if the {@link FileOutputStream} for the
+     *         file cannot be created
+     */
+    public XmlStreamWriter(File file) throws FileNotFoundException {
+        this(new FileOutputStream(file));
+    }
+
+    /**
+     * Returns the encoding chosen for the document.
+     *
+     * @return the encoding, or null if it has not been chosen yet
+     */
+    public String getEncoding() {
+        return encoding;
+    }
+
+    /**
+     * Closes the writer. If no encoding has been chosen yet, UTF-8 is used
+     * and the buffered characters are written before closing.
+     *
+     * @throws IOException if writing or closing fails
+     */
+    @Override
+    public void close() throws IOException {
+        if (writer == null) {
+            encoding = "UTF-8";
+            writer = new OutputStreamWriter(out, encoding);
+            writer.write(xmlPrologWriter.toString());
+        }
+        writer.close();
+    }
+
+    /**
+     * Flushes the encoding writer, if it already exists. Characters still
+     * buffered for encoding detection are not written by this call.
+     *
+     * @throws IOException if flushing fails
+     */
+    @Override
+    public void flush() throws IOException {
+        if (writer != null) {
+            writer.flush();
+        }
+    }
+
+    /**
+     * Buffers the given characters (up to {@code BUFFER_SIZE} in total) and
+     * tries to choose the encoding from what has been buffered so far. Once
+     * the encoding is chosen, the buffered characters and any remainder of
+     * this call are written through the newly created encoding writer.
+     *
+     * @param cbuf the characters being written
+     * @param off offset of the first character to write
+     * @param len number of characters to write
+     * @throws IOException if the encoding writer cannot be created or written
+     */
+    private void detectEncoding(char[] cbuf, int off, int len)
+            throws IOException {
+        int size = len;
+        StringBuffer xmlProlog = xmlPrologWriter.getBuffer();
+        if (xmlProlog.length() + len > BUFFER_SIZE) {
+            // never buffer more than BUFFER_SIZE characters
+            size = BUFFER_SIZE - xmlProlog.length();
+        }
+        xmlPrologWriter.write(cbuf, off, size);
+
+        // try to determine encoding
+        if (xmlProlog.length() >= 5) {
+            if (xmlProlog.substring(0, 5).equals("<?xml")) {
+                // try to extract encoding from XML prolog
+                int xmlPrologEnd = xmlProlog.indexOf("?>");
+                if (xmlPrologEnd > 0) {
+                    // ok, full XML prolog written: let's extract encoding
+                    Matcher m = ENCODING_PATTERN.matcher(xmlProlog.substring(0,
+                            xmlPrologEnd));
+                    if (m.find()) {
+                        // Upper-case with a fixed locale: the default locale
+                        // may map ASCII letters to non-ASCII ones (e.g. 'i'
+                        // under a Turkish locale), yielding an unknown charset.
+                        encoding = m.group(1).toUpperCase(Locale.ENGLISH);
+                        // drop first and last character (presumably the
+                        // quotes captured by the pattern)
+                        encoding = encoding.substring(1, encoding.length() - 1);
+                    } else {
+                        // no encoding found in XML prolog: using default
+                        // encoding
+                        encoding = "UTF-8";
+                    }
+                } else {
+                    if (xmlProlog.length() >= BUFFER_SIZE) {
+                        // no encoding found in first characters: using default
+                        // encoding
+                        encoding = "UTF-8";
+                    }
+                }
+            } else {
+                // no XML prolog: using default encoding
+                encoding = "UTF-8";
+            }
+            if (encoding != null) {
+                // encoding has been chosen: let's do it
+                xmlPrologWriter = null;
+                writer = new OutputStreamWriter(out, encoding);
+                writer.write(xmlProlog.toString());
+                if (len > size) {
+                    // write what did not fit into the detection buffer
+                    writer.write(cbuf, off + size, len - size);
+                }
+            }
+        }
+    }
+
+    /**
+     * Writes the characters, buffering them for encoding detection as long
+     * as the encoding has not been chosen.
+     *
+     * @param cbuf the characters to write
+     * @param off offset of the first character to write
+     * @param len number of characters to write
+     * @throws IOException if writing fails
+     */
+    @Override
+    public void write(char[] cbuf, int off, int len) throws IOException {
+        if (xmlPrologWriter != null) {
+            detectEncoding(cbuf, off, len);
+        } else {
+            writer.write(cbuf, off, len);
+        }
+    }
+}
Propchange: commons/proper/io/trunk/src/java/org/apache/commons/io/output/XmlStreamWriter.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: commons/proper/io/trunk/src/java/org/apache/commons/io/output/XmlStreamWriter.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Added: commons/proper/io/trunk/src/test/org/apache/commons/io/input/XmlStreamReaderTest.java
URL: http://svn.apache.org/viewvc/commons/proper/io/trunk/src/test/org/apache/commons/io/input/XmlStreamReaderTest.java?rev=1004090&view=auto
==============================================================================
--- commons/proper/io/trunk/src/test/org/apache/commons/io/input/XmlStreamReaderTest.java (added)
+++ commons/proper/io/trunk/src/test/org/apache/commons/io/input/XmlStreamReaderTest.java Mon Oct 4 02:59:49 2010
@@ -0,0 +1,360 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.io.input;
+
+import java.io.*;
+import java.text.MessageFormat;
+import java.util.HashMap;
+import java.util.Map;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests the encoding detection of {@link XmlStreamReader} against generated
+ * XML streams that combine a BOM, a stream encoding and a prolog encoding.
+ *
+ * @author Alejandro Abdelnur
+ */
+public class XmlStreamReaderTest extends TestCase {
+    private static final String XML5 = "xml-prolog-encoding-spaced-single-quotes";
+    private static final String XML4 = "xml-prolog-encoding-single-quotes";
+    private static final String XML3 = "xml-prolog-encoding-double-quotes";
+    private static final String XML2 = "xml-prolog";
+    private static final String XML1 = "xml";
+
+    /**
+     * Checks the encoding reported for BOM-less streams of every XML flavour.
+     *
+     * @param encoding encoding used both for the stream and in the prolog
+     */
+    protected void _testRawNoBomValid(String encoding) throws Exception {
+        InputStream is = getXmlStream("no-bom", XML1, encoding, encoding);
+        XmlStreamReader xmlReader = new XmlStreamReader(is, false);
+        assertEquals("UTF-8", xmlReader.getEncoding());
+
+        is = getXmlStream("no-bom", XML2, encoding, encoding);
+        xmlReader = new XmlStreamReader(is);
+        assertEquals("UTF-8", xmlReader.getEncoding());
+
+        is = getXmlStream("no-bom", XML3, encoding, encoding);
+        xmlReader = new XmlStreamReader(is);
+        assertEquals(encoding, xmlReader.getEncoding());
+
+        is = getXmlStream("no-bom", XML4, encoding, encoding);
+        xmlReader = new XmlStreamReader(is);
+        assertEquals(encoding, xmlReader.getEncoding());
+
+        is = getXmlStream("no-bom", XML5, encoding, encoding);
+        xmlReader = new XmlStreamReader(is);
+        assertEquals(encoding, xmlReader.getEncoding());
+    }
+
+    /**
+     * Expects a BOM-less stream with the given encoding to be rejected.
+     *
+     * @param encoding encoding used both for the stream and in the prolog
+     */
+    protected void _testRawNoBomInvalid(String encoding) throws Exception {
+        InputStream is = getXmlStream("no-bom", XML3, encoding, encoding);
+        try {
+            new XmlStreamReader(is, false);
+            fail("It should have failed");
+        } catch (IOException ex) {
+            assertTrue(ex.getMessage().contains("Invalid encoding,"));
+        }
+    }
+
+    public void testRawNoBom() throws Exception {
+        _testRawNoBomValid("US-ASCII");
+        _testRawNoBomValid("UTF-8");
+        _testRawNoBomValid("ISO-8859-1");
+        _testRawNoBomValid("CP1047");
+    }
+
+    /**
+     * Checks the encoding reported for a stream starting with the BOM of the
+     * given encoding.
+     *
+     * @param encoding encoding used for the BOM, the stream and the prolog
+     */
+    protected void _testRawBomValid(String encoding) throws Exception {
+        InputStream is = getXmlStream(encoding + "-bom", XML3, encoding,
+                encoding);
+        XmlStreamReader xmlReader = new XmlStreamReader(is, false);
+        if (!encoding.equals("UTF-16")) {
+            assertEquals(encoding, xmlReader.getEncoding());
+        } else {
+            // only the "UTF-16" prefix of the reported encoding is compared
+            assertEquals(encoding, xmlReader.getEncoding()
+                    .substring(0, encoding.length()));
+        }
+    }
+
+    /**
+     * Expects the combination of BOM, stream encoding and prolog encoding to
+     * be rejected.
+     */
+    protected void _testRawBomInvalid(String bomEnc, String streamEnc,
+            String prologEnc) throws Exception {
+        InputStream is = getXmlStream(bomEnc, XML3, streamEnc, prologEnc);
+        try {
+            XmlStreamReader xmlReader = new XmlStreamReader(is, false);
+            String foundEnc = xmlReader.getEncoding();
+            fail("It should have failed for BOM " + bomEnc + ", streamEnc "
+                    + streamEnc + " and prologEnc " + prologEnc + ": found "
+                    + foundEnc);
+        } catch (IOException ex) {
+            assertTrue(ex.getMessage().contains("Invalid encoding,"));
+        }
+    }
+
+    public void testRawBom() throws Exception {
+        _testRawBomValid("UTF-8");
+        _testRawBomValid("UTF-16BE");
+        _testRawBomValid("UTF-16LE");
+        _testRawBomValid("UTF-16");
+
+        _testRawBomInvalid("UTF-8-bom", "US-ASCII", "US-ASCII");
+        _testRawBomInvalid("UTF-8-bom", "ISO-8859-1", "ISO-8859-1");
+        _testRawBomInvalid("UTF-8-bom", "UTF-8", "UTF-16");
+        _testRawBomInvalid("UTF-8-bom", "UTF-8", "UTF-16BE");
+        _testRawBomInvalid("UTF-8-bom", "UTF-8", "UTF-16LE");
+        _testRawBomInvalid("UTF-16BE-bom", "UTF-16BE", "UTF-16LE");
+        _testRawBomInvalid("UTF-16LE-bom", "UTF-16LE", "UTF-16BE");
+        _testRawBomInvalid("UTF-16LE-bom", "UTF-16LE", "UTF-8");
+    }
+
+    public void testHttp() throws Exception {
+        _testHttpValid("application/xml", "no-bom", "US-ASCII", null);
+        _testHttpValid("application/xml", "UTF-8-bom", "US-ASCII", null);
+        _testHttpValid("application/xml", "UTF-8-bom", "UTF-8", null);
+        _testHttpValid("application/xml", "UTF-8-bom", "UTF-8", "UTF-8");
+        _testHttpValid("application/xml;charset=UTF-8", "UTF-8-bom", "UTF-8",
+                null);
+        _testHttpValid("application/xml;charset=\"UTF-8\"", "UTF-8-bom",
+                "UTF-8", null);
+        _testHttpValid("application/xml;charset='UTF-8'", "UTF-8-bom", "UTF-8",
+                null);
+        _testHttpValid("application/xml;charset=UTF-8", "UTF-8-bom", "UTF-8",
+                "UTF-8");
+        _testHttpValid("application/xml;charset=UTF-16", "UTF-16BE-bom",
+                "UTF-16BE", null);
+        _testHttpValid("application/xml;charset=UTF-16", "UTF-16BE-bom",
+                "UTF-16BE", "UTF-16");
+        _testHttpValid("application/xml;charset=UTF-16", "UTF-16BE-bom",
+                "UTF-16BE", "UTF-16BE");
+
+        _testHttpInvalid("application/xml;charset=UTF-16BE", "UTF-16BE-bom",
+                "UTF-16BE", null);
+        _testHttpInvalid("application/xml;charset=UTF-16BE", "UTF-16BE-bom",
+                "UTF-16BE", "UTF-16");
+        _testHttpInvalid("application/xml;charset=UTF-16BE", "UTF-16BE-bom",
+                "UTF-16BE", "UTF-16BE");
+        _testHttpInvalid("application/xml", "UTF-8-bom", "US-ASCII", "US-ASCII");
+        _testHttpInvalid("application/xml;charset=UTF-16", "UTF-16LE", "UTF-8",
+                "UTF-8");
+        _testHttpInvalid("application/xml;charset=UTF-16", "no-bom",
+                "UTF-16BE", "UTF-16BE");
+
+        _testHttpValid("text/xml", "no-bom", "US-ASCII", null);
+        _testHttpValid("text/xml;charset=UTF-8", "UTF-8-bom", "UTF-8", "UTF-8");
+        _testHttpValid("text/xml;charset=UTF-8", "UTF-8-bom", "UTF-8", null);
+        _testHttpValid("text/xml;charset=UTF-16", "UTF-16BE-bom", "UTF-16BE",
+                null);
+        _testHttpValid("text/xml;charset=UTF-16", "UTF-16BE-bom", "UTF-16BE",
+                "UTF-16");
+        _testHttpValid("text/xml;charset=UTF-16", "UTF-16BE-bom", "UTF-16BE",
+                "UTF-16BE");
+        _testHttpValid("text/xml", "UTF-8-bom", "US-ASCII", null);
+
+        _testAlternateDefaultEncoding("application/xml", "UTF-8-bom", "UTF-8",
+                null, null);
+        _testAlternateDefaultEncoding("application/xml", "no-bom", "US-ASCII",
+                null, "US-ASCII");
+        _testAlternateDefaultEncoding("application/xml", "UTF-8-bom", "UTF-8",
+                null, "UTF-8");
+        _testAlternateDefaultEncoding("text/xml", "no-bom", "US-ASCII", null,
+                null);
+        _testAlternateDefaultEncoding("text/xml", "no-bom", "US-ASCII", null,
+                "US-ASCII");
+        _testAlternateDefaultEncoding("text/xml", "no-bom", "US-ASCII", null,
+                "UTF-8");
+
+        _testHttpInvalid("text/xml;charset=UTF-16BE", "UTF-16BE-bom",
+                "UTF-16BE", null);
+        _testHttpInvalid("text/xml;charset=UTF-16BE", "UTF-16BE-bom",
+                "UTF-16BE", "UTF-16");
+        _testHttpInvalid("text/xml;charset=UTF-16BE", "UTF-16BE-bom",
+                "UTF-16BE", "UTF-16BE");
+        _testHttpInvalid("text/xml;charset=UTF-16", "no-bom", "UTF-16BE",
+                "UTF-16BE");
+        _testHttpInvalid("text/xml;charset=UTF-16", "no-bom", "UTF-16BE", null);
+
+        _testHttpLenient("text/xml", "no-bom", "US-ASCII", null, "US-ASCII");
+        _testHttpLenient("text/xml;charset=UTF-8", "UTF-8-bom", "UTF-8",
+                "UTF-8", "UTF-8");
+        _testHttpLenient("text/xml;charset=UTF-8", "UTF-8-bom", "UTF-8", null,
+                "UTF-8");
+        _testHttpLenient("text/xml;charset=UTF-16", "UTF-16BE-bom", "UTF-16BE",
+                null, "UTF-16BE");
+        _testHttpLenient("text/xml;charset=UTF-16", "UTF-16BE-bom", "UTF-16BE",
+                "UTF-16", "UTF-16");
+        _testHttpLenient("text/xml;charset=UTF-16", "UTF-16BE-bom", "UTF-16BE",
+                "UTF-16BE", "UTF-16BE");
+        _testHttpLenient("text/xml", "UTF-8-bom", "US-ASCII", null, "US-ASCII");
+
+        _testHttpLenient("text/xml;charset=UTF-16BE", "UTF-16BE-bom",
+                "UTF-16BE", null, "UTF-16BE");
+        _testHttpLenient("text/xml;charset=UTF-16BE", "UTF-16BE-bom",
+                "UTF-16BE", "UTF-16", "UTF-16");
+        _testHttpLenient("text/xml;charset=UTF-16BE", "UTF-16BE-bom",
+                "UTF-16BE", "UTF-16BE", "UTF-16BE");
+        _testHttpLenient("text/xml;charset=UTF-16", "no-bom", "UTF-16BE",
+                "UTF-16BE", "UTF-16BE");
+        _testHttpLenient("text/xml;charset=UTF-16", "no-bom", "UTF-16BE", null,
+                "UTF-16");
+
+        _testHttpLenient("text/html", "no-bom", "US-ASCII", "US-ASCII",
+                "US-ASCII");
+        _testHttpLenient("text/html", "no-bom", "US-ASCII", null, "US-ASCII");
+        _testHttpLenient("text/html;charset=UTF-8", "no-bom", "US-ASCII",
+                "UTF-8", "UTF-8");
+        _testHttpLenient("text/html;charset=UTF-16BE", "no-bom", "US-ASCII",
+                "UTF-8", "UTF-8");
+    }
+
+    /**
+     * Builds a reader while an alternate default encoding is installed and
+     * always resets the default encoding afterwards.
+     */
+    public void _testAlternateDefaultEncoding(String cT, String bomEnc,
+            String streamEnc, String prologEnc, String alternateEnc)
+            throws Exception {
+        try {
+            InputStream is = getXmlStream(bomEnc, (prologEnc == null) ? XML1
+                    : XML3, streamEnc, prologEnc);
+            XmlStreamReader.setDefaultEncoding(alternateEnc);
+            XmlStreamReader xmlReader = new XmlStreamReader(is, cT, false);
+            // for other stream encodings we can not assert things here
+            // because UTF-8, US-ASCII and ISO-8859-1 look alike for the
+            // chars used for detection
+            if (streamEnc.equals("UTF-16")) {
+                assertEquals(streamEnc, xmlReader.getEncoding().substring(0,
+                        streamEnc.length()));
+            }
+        } finally {
+            XmlStreamReader.setDefaultEncoding(null);
+        }
+    }
+
+    /**
+     * Expects the reader to accept the combination of content-type, BOM,
+     * stream encoding and prolog encoding.
+     */
+    public void _testHttpValid(String cT, String bomEnc, String streamEnc,
+            String prologEnc) throws Exception {
+        InputStream is = getXmlStream(bomEnc,
+                (prologEnc == null) ? XML1 : XML3, streamEnc, prologEnc);
+        XmlStreamReader xmlReader = new XmlStreamReader(is, cT, false);
+        // for other stream encodings we can not assert things here because
+        // UTF-8, US-ASCII and ISO-8859-1 look alike for the chars used for
+        // detection
+        if (streamEnc.equals("UTF-16")) {
+            assertEquals(streamEnc, xmlReader.getEncoding().substring(0,
+                    streamEnc.length()));
+        }
+    }
+
+    /**
+     * Expects the reader to reject the combination of content-type, BOM,
+     * stream encoding and prolog encoding.
+     */
+    protected void _testHttpInvalid(String cT, String bomEnc, String streamEnc,
+            String prologEnc) throws Exception {
+        InputStream is = getXmlStream(bomEnc,
+                (prologEnc == null) ? XML2 : XML3, streamEnc, prologEnc);
+        try {
+            new XmlStreamReader(is, cT, false);
+            fail("It should have failed for HTTP Content-type " + cT + ", BOM "
+                    + bomEnc + ", streamEnc " + streamEnc + " and prologEnc "
+                    + prologEnc);
+        } catch (IOException ex) {
+            assertTrue(ex.getMessage().contains("Invalid encoding,"));
+        }
+    }
+
+    /**
+     * Checks the encoding reported by a lenient reader.
+     *
+     * @param shouldbe the encoding the reader is expected to report
+     */
+    protected void _testHttpLenient(String cT, String bomEnc, String streamEnc,
+            String prologEnc, String shouldbe) throws Exception {
+        InputStream is = getXmlStream(bomEnc,
+                (prologEnc == null) ? XML2 : XML3, streamEnc, prologEnc);
+        XmlStreamReader xmlReader = new XmlStreamReader(is, cT, true);
+        assertEquals(shouldbe, xmlReader.getEncoding());
+    }
+
+    /** Document whose first "encoding" attribute is outside the XML prolog. */
+    private static final String ENCODING_ATTRIBUTE_XML = "<?xml version=\"1.0\" ?> \n"
+            + "<atom:feed xmlns:atom=\"http://www.w3.org/2005/Atom\">\n"
+            + "\n"
+            + " <atom:entry>\n"
+            + " <atom:title encoding='base64'><![CDATA\n"
+            + "aW5nTGluZSIgLz4";
+
+    public void testEncodingAttributeXML() throws Exception {
+        InputStream is = new ByteArrayInputStream(ENCODING_ATTRIBUTE_XML
+                .getBytes("UTF-8"));
+        XmlStreamReader xmlReader = new XmlStreamReader(is, "", true);
+        assertEquals("UTF-8", xmlReader.getEncoding());
+    }
+
+    // XML Stream generator
+
+    private static final int[] NO_BOM_BYTES = {};
+    private static final int[] UTF_16BE_BOM_BYTES = { 0xFE, 0xFF };
+    private static final int[] UTF_16LE_BOM_BYTES = { 0xFF, 0XFE };
+    private static final int[] UTF_8_BOM_BYTES = { 0xEF, 0xBB, 0xBF };
+
+    private static final Map<String, int[]> BOMs = new HashMap<String, int[]>();
+
+    static {
+        BOMs.put("no-bom", NO_BOM_BYTES);
+        BOMs.put("UTF-16BE-bom", UTF_16BE_BOM_BYTES);
+        BOMs.put("UTF-16LE-bom", UTF_16LE_BOM_BYTES);
+        BOMs.put("UTF-16-bom", NO_BOM_BYTES); // it's added by the writer
+        BOMs.put("UTF-8-bom", UTF_8_BOM_BYTES);
+    }
+
+    private static final MessageFormat XML = new MessageFormat(
+            "<root>{2}</root>");
+    private static final MessageFormat XML_WITH_PROLOG = new MessageFormat(
+            "<?xml version=\"1.0\"?>\n<root>{2}</root>");
+    private static final MessageFormat XML_WITH_PROLOG_AND_ENCODING_DOUBLE_QUOTES = new MessageFormat(
+            "<?xml version=\"1.0\" encoding=\"{1}\"?>\n<root>{2}</root>");
+    private static final MessageFormat XML_WITH_PROLOG_AND_ENCODING_SINGLE_QUOTES = new MessageFormat(
+            "<?xml version=\"1.0\" encoding=''{1}''?>\n<root>{2}</root>");
+    private static final MessageFormat XML_WITH_PROLOG_AND_ENCODING_SPACED_SINGLE_QUOTES = new MessageFormat(
+            "<?xml version=\"1.0\" encoding = \t \n \r''{1}''?>\n<root>{2}</root>");
+
+    private static final MessageFormat INFO = new MessageFormat(
+            "\nBOM : {0}\nDoc : {1}\nStream Enc : {2}\nProlog Enc : {3}\n");
+
+    private static final Map<String, MessageFormat> XMLs = new HashMap<String, MessageFormat>();
+
+    static {
+        XMLs.put(XML1, XML);
+        XMLs.put(XML2, XML_WITH_PROLOG);
+        XMLs.put(XML3, XML_WITH_PROLOG_AND_ENCODING_DOUBLE_QUOTES);
+        XMLs.put(XML4, XML_WITH_PROLOG_AND_ENCODING_SINGLE_QUOTES);
+        XMLs.put(XML5, XML_WITH_PROLOG_AND_ENCODING_SPACED_SINGLE_QUOTES);
+    }
+
+    /**
+     * Generates an XML document as a byte stream: the BOM bytes registered
+     * for {@code bomType} (none for an unknown type), followed by the
+     * document of the requested flavour and a long padding, both encoded
+     * with {@code streamEnc}.
+     *
+     * @param bomType no-bom, UTF-16BE-bom, UTF-16LE-bom, UTF-16-bom, UTF-8-bom
+     * @param xmlType one of the keys registered in {@code XMLs}
+     * @param streamEnc encoding used to turn the document into bytes
+     * @param prologEnc encoding name inserted in the prolog, when the flavour
+     *        has an encoding declaration
+     * @return XML stream
+     * @throws IOException if the document cannot be encoded
+     */
+    protected InputStream getXmlStream(String bomType, String xmlType,
+            String streamEnc, String prologEnc) throws IOException {
+        ByteArrayOutputStream baos = new ByteArrayOutputStream(1024);
+        int[] bom = BOMs.get(bomType);
+        if (bom == null) {
+            bom = new int[0];
+        }
+        MessageFormat xml = XMLs.get(xmlType);
+        for (int i = 0; i < bom.length; i++) {
+            baos.write(bom[i]);
+        }
+        Writer writer = new OutputStreamWriter(baos, streamEnc);
+        // INFO has four placeholders: pass the stream encoding too, so that
+        // "Stream Enc" and "Prolog Enc" each show their own value
+        String info = INFO.format(new Object[] { bomType, xmlType, streamEnc,
+                prologEnc });
+        String xmlDoc = xml.format(new Object[] { streamEnc, prologEnc, info });
+        writer.write(xmlDoc);
+
+        // PADDING TO TEST THINGS WORK BEYOND PUSHBACK_SIZE
+        writer.write("<da>\n");
+        for (int i = 0; i < 10000; i++) {
+            writer.write("<do/>\n");
+        }
+        writer.write("</da>\n");
+
+        writer.close();
+        return new ByteArrayInputStream(baos.toByteArray());
+    }
+}
Propchange: commons/proper/io/trunk/src/test/org/apache/commons/io/input/XmlStreamReaderTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: commons/proper/io/trunk/src/test/org/apache/commons/io/input/XmlStreamReaderTest.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Added: commons/proper/io/trunk/src/test/org/apache/commons/io/output/XmlStreamWriterTest.java
URL: http://svn.apache.org/viewvc/commons/proper/io/trunk/src/test/org/apache/commons/io/output/XmlStreamWriterTest.java?rev=1004090&view=auto
==============================================================================
--- commons/proper/io/trunk/src/test/org/apache/commons/io/output/XmlStreamWriterTest.java (added)
+++ commons/proper/io/trunk/src/test/org/apache/commons/io/output/XmlStreamWriterTest.java Mon Oct 4 02:59:49 2010
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.io.output;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+import junit.framework.TestCase;
+
+/**
+ * Checks that the bytes produced by {@link XmlStreamWriter} decode back to
+ * the written document when read with the encoding declared in its XML
+ * prolog, or with UTF-8 when no encoding is declared.
+ *
+ * @author Herve Boutemy
+ * @version $Id$
+ */
+public class XmlStreamWriterTest extends TestCase {
+    /** french */
+    private static final String TEXT_LATIN1 = "eacute: \u00E9";
+    /** greek */
+    private static final String TEXT_LATIN7 = "alpha: \u03B1";
+    /** euro support */
+    private static final String TEXT_LATIN15 = "euro: \u20AC";
+    /** japanese */
+    private static final String TEXT_EUC_JP = "hiragana A: \u3042";
+    /** Unicode: support everything */
+    private static final String TEXT_UNICODE = TEXT_LATIN1 + ", " + TEXT_LATIN7
+            + ", " + TEXT_LATIN15 + ", " + TEXT_EUC_JP;
+
+    /**
+     * Builds an XML document holding the text, with an encoding declaration
+     * in the prolog unless the encoding is null.
+     */
+    private static String createXmlContent(String text, String encoding) {
+        String prolog = (encoding == null)
+                ? "<?xml version=\"1.0\"?>"
+                : "<?xml version=\"1.0\" encoding=\"" + encoding + "\"?>";
+        return prolog + "\n<text>" + text + "</text>";
+    }
+
+    /**
+     * Writes the document through an XmlStreamWriter and checks that the
+     * produced bytes, decoded with the given encoding, equal the document.
+     */
+    private static void checkXmlContent(String xml, String encoding)
+            throws IOException {
+        ByteArrayOutputStream sink = new ByteArrayOutputStream();
+        XmlStreamWriter xmlWriter = new XmlStreamWriter(sink);
+        xmlWriter.write(xml);
+        xmlWriter.close();
+        String decoded = new String(sink.toByteArray(), encoding);
+        assertEquals(xml, decoded);
+    }
+
+    /**
+     * Checks a document built from the text and encoding; a null encoding
+     * means no declaration in the prolog and UTF-8 expected on output.
+     */
+    private static void checkXmlWriter(String text, String encoding)
+            throws IOException {
+        String expectedEncoding;
+        if (encoding == null) {
+            expectedEncoding = "UTF-8";
+        } else {
+            expectedEncoding = encoding;
+        }
+        checkXmlContent(createXmlContent(text, encoding), expectedEncoding);
+    }
+
+    public void testNoXmlHeader() throws IOException {
+        checkXmlContent("<text>text with no XML header</text>", "UTF-8");
+    }
+
+    /** Exercises flush/write/close on a writer that receives almost nothing. */
+    public void testEmpty() throws IOException {
+        XmlStreamWriter xmlWriter = new XmlStreamWriter(
+                new ByteArrayOutputStream());
+        xmlWriter.flush();
+        xmlWriter.write("");
+        xmlWriter.flush();
+        xmlWriter.write(".");
+        xmlWriter.flush();
+        xmlWriter.close();
+    }
+
+    public void testDefaultEncoding() throws IOException {
+        checkXmlWriter(TEXT_UNICODE, null);
+    }
+
+    public void testUTF8Encoding() throws IOException {
+        checkXmlWriter(TEXT_UNICODE, "UTF-8");
+    }
+
+    public void testUTF16Encoding() throws IOException {
+        checkXmlWriter(TEXT_UNICODE, "UTF-16");
+    }
+
+    public void testUTF16BEEncoding() throws IOException {
+        checkXmlWriter(TEXT_UNICODE, "UTF-16BE");
+    }
+
+    public void testUTF16LEEncoding() throws IOException {
+        checkXmlWriter(TEXT_UNICODE, "UTF-16LE");
+    }
+
+    public void testLatin1Encoding() throws IOException {
+        checkXmlWriter(TEXT_LATIN1, "ISO-8859-1");
+    }
+
+    public void testLatin7Encoding() throws IOException {
+        checkXmlWriter(TEXT_LATIN7, "ISO-8859-7");
+    }
+
+    public void testLatin15Encoding() throws IOException {
+        checkXmlWriter(TEXT_LATIN15, "ISO-8859-15");
+    }
+
+    public void testEUC_JPEncoding() throws IOException {
+        checkXmlWriter(TEXT_EUC_JP, "EUC-JP");
+    }
+
+    public void testEBCDICEncoding() throws IOException {
+        checkXmlWriter("simple text in EBCDIC", "CP1047");
+    }
+}
Propchange: commons/proper/io/trunk/src/test/org/apache/commons/io/output/XmlStreamWriterTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: commons/proper/io/trunk/src/test/org/apache/commons/io/output/XmlStreamWriterTest.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL