You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ni...@apache.org on 2010/10/04 04:59:49 UTC
svn commit: r1004090 - in /commons/proper/io/trunk/src: java/org/apache/commons/io/input/ java/org/apache/commons/io/output/ test/org/apache/commons/io/input/ test/org/apache/commons/io/output/

Author: niallp
Date: Mon Oct  4 02:59:49 2010
New Revision: 1004090

URL: http://svn.apache.org/viewvc?rev=1004090&view=rev
Log:
IO-162 add Xml(Stream)Reader/Writer from ROME - thanks to Hervé Boutemy for the patch

Added:
    commons/proper/io/trunk/src/java/org/apache/commons/io/input/XmlStreamReader.java   (with props)
    commons/proper/io/trunk/src/java/org/apache/commons/io/input/XmlStreamReaderException.java   (with props)
    commons/proper/io/trunk/src/java/org/apache/commons/io/output/XmlStreamWriter.java   (with props)
    commons/proper/io/trunk/src/test/org/apache/commons/io/input/XmlStreamReaderTest.java   (with props)
    commons/proper/io/trunk/src/test/org/apache/commons/io/output/XmlStreamWriterTest.java   (with props)

Added: commons/proper/io/trunk/src/java/org/apache/commons/io/input/XmlStreamReader.java
URL: http://svn.apache.org/viewvc/commons/proper/io/trunk/src/java/org/apache/commons/io/input/XmlStreamReader.java?rev=1004090&view=auto
==============================================================================
--- commons/proper/io/trunk/src/java/org/apache/commons/io/input/XmlStreamReader.java (added)
+++ commons/proper/io/trunk/src/java/org/apache/commons/io/input/XmlStreamReader.java Mon Oct  4 02:59:49 2010
@@ -0,0 +1,703 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.io.input;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.io.StringReader;
+import java.net.URL;
+import java.net.URLConnection;
+import java.net.HttpURLConnection;
+import java.util.regex.Pattern;
+import java.util.regex.Matcher;
+import java.text.MessageFormat;
+
+/**
+ * Character stream that handles all the necessary Voodoo to figure out the
+ * charset encoding of the XML document within the stream.
+ * <p>
+ * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader.
+ * This one IS a character stream.
+ * <p>
+ * All this has to be done without consuming characters from the stream, if not
+ * the XML parser will not recognize the document as a valid XML. This is not
+ * 100% true, but it's close enough (UTF-8 BOM is not handled by all parsers
+ * right now, XmlStreamReader handles it and things work in all parsers).
+ * <p>
+ * The XmlStreamReader class handles the charset encoding of XML documents in
+ * Files, raw streams and HTTP streams by offering a wide set of constructors.
+ * <p>
+ * By default the charset encoding detection is lenient; the constructor with
+ * the lenient flag can be used for a strict detection (following HTTP MIME and
+ * XML specifications). All this is nicely explained by Mark Pilgrim in his blog, <a
+ * href="http://diveintomark.org/archives/2004/02/13/xml-media-types">
+ * Determining the character encoding of a feed</a>.
+ * <p>
+ * Originally developed for <a href="http://rome.dev.java.net">ROME</a> under
+ * Apache License 2.0.
+ *
+ * @author Alejandro Abdelnur
+ * @version $Id$
+ * @see XmlStreamWriter
+ */
+public class XmlStreamReader extends Reader {
+    /** Size of the BufferedInputStream wrapped around the input and of the prolog look-ahead. */
+    private static final int BUFFER_SIZE = 4096;
+
+    // Charset names compared against, and returned by, the detection methods below.
+    private static final String UTF_8 = "UTF-8";
+
+    private static final String US_ASCII = "US-ASCII";
+
+    private static final String UTF_16BE = "UTF-16BE";
+
+    private static final String UTF_16LE = "UTF-16LE";
+
+    private static final String UTF_16 = "UTF-16";
+
+    /** Charset name returned by getXMLGuessEncoding when the leading bytes match its EBCDIC signature. */
+    private static final String EBCDIC = "CP1047";
+
+    /** Class-wide fallback encoding, changed through setDefaultEncoding; NULL initially. */
+    private static String staticDefaultEncoding = null;
+
+    /** Reader that read() and close() delegate to; assigned by prepareReader. */
+    private Reader reader;
+
+    /** Charset name the delegate reader was created with; returned by getEncoding(). */
+    private String encoding;
+
+    /** Per-instance fallback encoding, captured by the constructors. */
+    private String defaultEncoding;
+
+    /**
+     * Installs the class-wide fallback encoding, consulted when neither the
+     * HTTP content-type, the XML prolog nor the content-type based rules yield
+     * a charset.
+     * <p>
+     * Passing NULL (the initial value) leaves the content-type based rules in
+     * charge.
+     *
+     * @param encoding charset encoding to fall back to, may be NULL.
+     */
+    public static void setDefaultEncoding(String encoding) {
+        XmlStreamReader.staticDefaultEncoding = encoding;
+    }
+
+    /**
+     * Reports the class-wide fallback encoding that is applied when neither the
+     * HTTP content-type, the XML prolog nor the content-type based rules yield
+     * a charset.
+     * <p>
+     * A NULL result means the content-type based rules are used.
+     *
+     * @return the fallback encoding, or NULL if none has been set.
+     */
+    public static String getDefaultEncoding() {
+        return XmlStreamReader.staticDefaultEncoding;
+    }
+
+    /**
+     * Creates a Reader for a File by opening a FileInputStream on it and
+     * delegating to the raw InputStream constructor.
+     * <p>
+     * It looks for a BOM first; if there is none it sniffs the XML prolog
+     * charset; if that is also missing it falls back to the default encoding,
+     * or to UTF-8 when no default encoding has been set.
+     * <p>
+     * It does a lenient charset encoding detection, check the constructor with
+     * the lenient parameter for details.
+     * <p>
+     * NOTE(review): the FileInputStream is created inline, so it appears to
+     * stay open if detection fails with an exception - confirm whether that
+     * needs handling.
+     *
+     * @param file File to create a Reader from.
+     * @throws IOException thrown if there is a problem reading the file.
+     */
+    public XmlStreamReader(File file) throws IOException {
+        this(new FileInputStream(file));
+    }
+
+    /**
+     * Creates a Reader for a raw InputStream, using lenient detection.
+     * <p>
+     * It follows the same logic used for files: this constructor simply
+     * delegates to the (InputStream, boolean) constructor with the lenient flag
+     * set to true.
+     * <p>
+     * It does a lenient charset encoding detection, check the constructor with
+     * the lenient parameter for details.
+     *
+     * @param is InputStream to create a Reader from.
+     * @throws IOException thrown if there is a problem reading the stream.
+     */
+    public XmlStreamReader(InputStream is) throws IOException {
+        this(is, true);
+    }
+
+    /**
+     * Creates a Reader for a raw InputStream.
+     * <p>
+     * The encoding is worked out from the BOM, the first bytes of the stream
+     * and the XML prolog, exactly as it is for files.
+     * <p>
+     * When lenient detection is requested and that detection fails, no content
+     * type is available for a retry, so the fallback is:
+     * <p>
+     * If the XML prolog had a charset encoding that encoding is used.
+     * <p>
+     * Else if the failure recorded a content type charset encoding that
+     * encoding is used.
+     * <p>
+     * Else the default encoding is used, or 'UTF-8' when none has been set.
+     * <p>
+     * When lenient detection is not requested the XmlStreamReaderException is
+     * rethrown to the caller.
+     *
+     * @param is InputStream to create a Reader from.
+     * @param lenient indicates if the charset encoding detection should be
+     *        relaxed.
+     * @throws IOException thrown if there is a problem reading the stream.
+     * @throws XmlStreamReaderException thrown if the charset encoding could not
+     *         be determined according to the specs.
+     */
+    public XmlStreamReader(InputStream is, boolean lenient) throws IOException,
+            XmlStreamReaderException {
+        this.defaultEncoding = staticDefaultEncoding;
+        try {
+            doRawStream(is, lenient);
+        } catch (XmlStreamReaderException detectionFailure) {
+            if (lenient) {
+                doLenientDetection(null, detectionFailure);
+            } else {
+                throw detectionFailure;
+            }
+        }
+    }
+
+    /**
+     * Creates a Reader using the InputStream of a URL, by opening a connection
+     * to the URL and delegating to the URLConnection constructor.
+     * <p>
+     * If the URL is not of type HTTP and there is no 'content-type' header in
+     * the fetched data it uses the same logic used for Files.
+     * <p>
+     * If the URL is an HTTP Url or there is a 'content-type' header in the
+     * fetched data it uses the same logic used for an InputStream with
+     * content-type.
+     * <p>
+     * It does a lenient charset encoding detection, check the constructor with
+     * the lenient parameter for details.
+     *
+     * @param url URL to create a Reader from.
+     * @throws IOException thrown if there is a problem reading the stream of
+     *         the URL.
+     */
+    public XmlStreamReader(URL url) throws IOException {
+        this(url.openConnection());
+    }
+
+    /**
+     * Creates a Reader using the InputStream of a URLConnection.
+     * <p>
+     * If the URLConnection is not of type HttpURLConnection and there is not
+     * 'content-type' header in the fetched data it uses the same logic used for
+     * files.
+     * <p>
+     * If the URLConnection is a HTTP Url or there is a 'content-type' header in
+     * the fetched data it uses the same logic used for an InputStream with
+     * content-type.
+     * <p>
+     * It does a lenient charset encoding detection, check the constructor with
+     * the lenient parameter for details.
+     *
+     * @param conn URLConnection to create a Reader from.
+     * @throws IOException thrown if there is a problem reading the stream of
+     *         the URLConnection.
+     */
+    public XmlStreamReader(URLConnection conn) throws IOException {
+        defaultEncoding = staticDefaultEncoding;
+        boolean lenient = true;
+        if (conn instanceof HttpURLConnection) {
+            try {
+                doHttpStream(conn.getInputStream(), conn.getContentType(),
+                        lenient);
+            } catch (XmlStreamReaderException ex) {
+                doLenientDetection(conn.getContentType(), ex);
+            }
+        } else if (conn.getContentType() != null) {
+            try {
+                doHttpStream(conn.getInputStream(), conn.getContentType(),
+                        lenient);
+            } catch (XmlStreamReaderException ex) {
+                doLenientDetection(conn.getContentType(), ex);
+            }
+        } else {
+            try {
+                doRawStream(conn.getInputStream(), lenient);
+            } catch (XmlStreamReaderException ex) {
+                doLenientDetection(null, ex);
+            }
+        }
+    }
+
+    /**
+     * Creates a Reader using an InputStream and the associated content-type
+     * header, by delegating to the (InputStream, String, boolean) constructor
+     * with the lenient flag set to true.
+     * <p>
+     * First it checks if the stream has BOM. If there is no BOM it checks the
+     * content-type encoding. If there is no content-type encoding it checks the
+     * XML prolog encoding. If there is no XML prolog encoding it uses the
+     * default encoding mandated by the content-type MIME type.
+     * <p>
+     * It does a lenient charset encoding detection, check the constructor with
+     * the lenient parameter for details.
+     *
+     * @param is InputStream to create the reader from.
+     * @param httpContentType content-type header to use for the resolution of
+     *        the charset encoding.
+     * @throws IOException thrown if there is a problem reading the file.
+     */
+    public XmlStreamReader(InputStream is, String httpContentType)
+            throws IOException {
+        this(is, httpContentType, true);
+    }
+
+    /**
+     * Creates a Reader using an InputStream and the associated content-type
+     * header, with an explicit fallback encoding.
+     * <p>
+     * First it checks if the stream has BOM. If there is no BOM it checks the
+     * content-type encoding. If there is no content-type encoding it checks the
+     * XML prolog encoding. If there is no XML prolog encoding it uses the
+     * default encoding mandated by the content-type MIME type.
+     * <p>
+     * If lenient detection is indicated and the detection above fails as per
+     * specifications it then attempts the following:
+     * <p>
+     * If the content type was 'text/html' it replaces it with 'text/xml' and
+     * tries the detection again.
+     * <p>
+     * Else if the XML prolog had a charset encoding that encoding is used.
+     * <p>
+     * Else if the content type had a charset encoding that encoding is used.
+     * <p>
+     * Else the default encoding is used, or 'UTF-8' when there is none.
+     * <p>
+     * If lenient detection is not indicated the XmlStreamReaderException is
+     * rethrown to the caller.
+     *
+     * @param is InputStream to create the reader from.
+     * @param httpContentType content-type header to use for the resolution of
+     *        the charset encoding.
+     * @param lenient indicates if the charset encoding detection should be
+     *        relaxed.
+     * @param defaultEncoding fallback encoding for this instance; when NULL
+     *        the class-wide default encoding is used instead.
+     * @throws IOException thrown if there is a problem reading the file.
+     * @throws XmlStreamReaderException thrown if the charset encoding could not
+     *         be determined according to the specs.
+     */
+    public XmlStreamReader(InputStream is, String httpContentType,
+            boolean lenient, String defaultEncoding) throws IOException,
+            XmlStreamReaderException {
+        if (defaultEncoding != null) {
+            this.defaultEncoding = defaultEncoding;
+        } else {
+            this.defaultEncoding = staticDefaultEncoding;
+        }
+        try {
+            doHttpStream(is, httpContentType, lenient);
+        } catch (XmlStreamReaderException detectionFailure) {
+            if (lenient) {
+                doLenientDetection(httpContentType, detectionFailure);
+            } else {
+                throw detectionFailure;
+            }
+        }
+    }
+
+    /**
+     * Creates a Reader using an InputStream and the associated content-type
+     * header. It delegates to the four-argument constructor with a NULL
+     * default encoding, so the class-wide default encoding applies.
+     * <p>
+     * First it checks if the stream has BOM. If there is no BOM it checks the
+     * content-type encoding. If there is no content-type encoding it checks the
+     * XML prolog encoding. If there is no XML prolog encoding it uses the
+     * default encoding mandated by the content-type MIME type.
+     * <p>
+     * If lenient detection is indicated and the detection above fails as per
+     * specifications it then attempts the following:
+     * <p>
+     * If the content type was 'text/html' it replaces it with 'text/xml' and
+     * tries the detection again.
+     * <p>
+     * Else if the XML prolog had a charset encoding that encoding is used.
+     * <p>
+     * Else if the content type had a charset encoding that encoding is used.
+     * <p>
+     * Else 'UTF-8' is used.
+     * <p>
+     * If lenient detection is indicated an XmlStreamReaderException is never
+     * thrown.
+     *
+     * @param is InputStream to create the reader from.
+     * @param httpContentType content-type header to use for the resolution of
+     *        the charset encoding.
+     * @param lenient indicates if the charset encoding detection should be
+     *        relaxed.
+     * @throws IOException thrown if there is a problem reading the file.
+     * @throws XmlStreamReaderException thrown if the charset encoding could not
+     *         be determined according to the specs.
+     */
+    public XmlStreamReader(InputStream is, String httpContentType,
+            boolean lenient) throws IOException, XmlStreamReaderException {
+        this(is, httpContentType, lenient, null);
+    }
+
+    /**
+     * Fallback used after a strict detection failure in lenient mode.
+     * <p>
+     * A 'text/html' content type is retried once as 'text/xml'. If there was
+     * no retry, or the retry failed too, the reader is prepared with the first
+     * available of: the XML prolog encoding, the content type encoding, the
+     * default encoding, UTF-8. The stream used is the one carried by the
+     * exception.
+     *
+     * @param httpContentType content-type header, may be NULL.
+     * @param ex failure raised by the strict detection.
+     * @throws IOException thrown if there is a problem reading the stream.
+     */
+    private void doLenientDetection(String httpContentType,
+            XmlStreamReaderException ex) throws IOException {
+        XmlStreamReaderException pending = ex;
+        if (httpContentType != null && httpContentType.startsWith("text/html")) {
+            // keep whatever followed the MIME type (e.g. the charset parameter)
+            String asXml = "text/xml"
+                    + httpContentType.substring("text/html".length());
+            try {
+                doHttpStream(ex.getInputStream(), asXml, true);
+                pending = null;
+            } catch (XmlStreamReaderException retryFailure) {
+                pending = retryFailure;
+            }
+        }
+        if (pending == null) {
+            // the retry succeeded and already prepared the reader
+            return;
+        }
+        String fallback = pending.getXmlEncoding();
+        if (fallback == null) {
+            fallback = pending.getContentTypeEncoding();
+        }
+        if (fallback == null) {
+            fallback = (defaultEncoding != null) ? defaultEncoding : UTF_8;
+        }
+        prepareReader(pending.getInputStream(), fallback);
+    }
+
+    /**
+     * Reports the charset encoding this XmlStreamReader decodes the stream
+     * with.
+     *
+     * @return charset encoding.
+     */
+    public String getEncoding() {
+        return this.encoding;
+    }
+
+    /**
+     * Reads characters into a portion of an array by delegating to the
+     * underlying reader.
+     *
+     * @param buf destination buffer.
+     * @param offset offset at which to start storing characters.
+     * @param len maximum number of characters to read.
+     * @return whatever the underlying reader returns for this call.
+     * @throws IOException thrown if the underlying reader fails.
+     */
+    public int read(char[] buf, int offset, int len) throws IOException {
+        final int count = this.reader.read(buf, offset, len);
+        return count;
+    }
+
+    /**
+     * Closes the XmlStreamReader stream by closing the underlying reader.
+     *
+     * @throws IOException thrown if there was a problem closing the stream.
+     */
+    public void close() throws IOException {
+        this.reader.close();
+    }
+
+    private void doRawStream(InputStream is, boolean lenient)
+            throws IOException {
+        BufferedInputStream pis = new BufferedInputStream(is, BUFFER_SIZE);
+        String bomEnc = getBOMEncoding(pis);
+        String xmlGuessEnc = getXMLGuessEncoding(pis);
+        String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
+        String encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, pis);
+        prepareReader(pis, encoding);
+    }
+
+    private void doHttpStream(InputStream is, String httpContentType,
+            boolean lenient) throws IOException {
+        BufferedInputStream pis = new BufferedInputStream(is, BUFFER_SIZE);
+        String cTMime = getContentTypeMime(httpContentType);
+        String cTEnc = getContentTypeEncoding(httpContentType);
+        String bomEnc = getBOMEncoding(pis);
+        String xmlGuessEnc = getXMLGuessEncoding(pis);
+        String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
+        String encoding = calculateHttpEncoding(cTMime, cTEnc, bomEnc,
+                xmlGuessEnc, xmlEnc, pis, lenient);
+        prepareReader(pis, encoding);
+    }
+
+    private void prepareReader(InputStream is, String encoding)
+            throws IOException {
+        reader = new InputStreamReader(is, encoding);
+        this.encoding = encoding;
+    }
+
+    /**
+     * Works out the encoding of a stream that has no content type information.
+     * <p>
+     * Without BOM: the default encoding (or UTF-8) is used when the byte guess
+     * or the prolog encoding is missing; a prolog of UTF-16 is refined to the
+     * guessed UTF-16BE/UTF-16LE; otherwise the prolog encoding wins.
+     * <p>
+     * With BOM: the byte guess and the prolog encoding must agree with the BOM
+     * (a UTF-16 prolog agrees with either UTF-16 BOM). Every disagreement is
+     * reported as XmlStreamReaderException so that lenient callers, which only
+     * catch that type, can fall back.
+     *
+     * @param bomEnc encoding indicated by the BOM, NULL if there was no BOM.
+     * @param xmlGuessEnc encoding guessed from the first bytes, may be NULL.
+     * @param xmlEnc encoding declared in the XML prolog, may be NULL.
+     * @param is stream, passed for XmlStreamReaderException creation only.
+     * @return the encoding to read the stream with.
+     * @throws IOException (as XmlStreamReaderException) if the pieces of
+     *         information contradict each other or the BOM is unknown.
+     */
+    private String calculateRawEncoding(String bomEnc, String xmlGuessEnc,
+            String xmlEnc, InputStream is) throws IOException {
+        if (bomEnc == null) {
+            if (xmlGuessEnc == null || xmlEnc == null) {
+                return (defaultEncoding == null) ? UTF_8 : defaultEncoding;
+            }
+            if (xmlEnc.equals(UTF_16)
+                    && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc
+                            .equals(UTF_16LE))) {
+                // the prolog does not say which byte order, the guess does
+                return xmlGuessEnc;
+            }
+            return xmlEnc;
+        }
+        if (bomEnc.equals(UTF_8)) {
+            boolean guessMismatch = xmlGuessEnc != null
+                    && !xmlGuessEnc.equals(UTF_8);
+            boolean prologMismatch = xmlEnc != null && !xmlEnc.equals(UTF_8);
+            if (guessMismatch || prologMismatch) {
+                throw new XmlStreamReaderException(RAW_EX_1
+                        .format(new Object[] { bomEnc, xmlGuessEnc, xmlEnc }),
+                        bomEnc, xmlGuessEnc, xmlEnc, is);
+            }
+            return UTF_8;
+        }
+        if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
+            boolean guessMismatch = xmlGuessEnc != null
+                    && !xmlGuessEnc.equals(bomEnc);
+            boolean prologMismatch = xmlEnc != null && !xmlEnc.equals(UTF_16)
+                    && !xmlEnc.equals(bomEnc);
+            if (guessMismatch || prologMismatch) {
+                // Previously the guess mismatch was a plain IOException, which
+                // bypassed the lenient fallback of the constructors.
+                throw new XmlStreamReaderException(RAW_EX_1
+                        .format(new Object[] { bomEnc, xmlGuessEnc, xmlEnc }),
+                        bomEnc, xmlGuessEnc, xmlEnc, is);
+            }
+            return bomEnc;
+        }
+        throw new XmlStreamReaderException(RAW_EX_2.format(new Object[] {
+                bomEnc, xmlGuessEnc, xmlEnc }), bomEnc, xmlGuessEnc, xmlEnc,
+                is);
+    }
+
+    /**
+     * Works out the encoding of a stream that comes with a content type.
+     * <p>
+     * In lenient mode a prolog encoding, when present, is used directly.
+     * Otherwise the MIME type must belong to the application or text XML
+     * family. Without a content type charset, application types use the raw
+     * stream rules and text types use the default encoding (or US-ASCII). A
+     * content type charset of UTF-16BE/UTF-16LE forbids a BOM, one of UTF-16
+     * requires a UTF-16 BOM, any other charset is used as is.
+     *
+     * @param cTMime MIME type of the content type, may be NULL.
+     * @param cTEnc charset of the content type, may be NULL.
+     * @param bomEnc encoding indicated by the BOM, NULL if there was no BOM.
+     * @param xmlGuessEnc encoding guessed from the first bytes, may be NULL.
+     * @param xmlEnc encoding declared in the XML prolog, may be NULL.
+     * @param is stream, passed for XmlStreamReaderException creation only.
+     * @param lenient whether the prolog encoding may short-cut the rules.
+     * @return the encoding to read the stream with.
+     * @throws IOException (as XmlStreamReaderException) if the rules above
+     *         are violated.
+     */
+    private String calculateHttpEncoding(String cTMime, String cTEnc,
+            String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is,
+            boolean lenient) throws IOException {
+        // logical AND (&&) instead of the bitwise form on booleans
+        if (lenient && xmlEnc != null) {
+            return xmlEnc;
+        }
+        boolean appXml = isAppXml(cTMime);
+        boolean textXml = isTextXml(cTMime);
+        if (!appXml && !textXml) {
+            throw new XmlStreamReaderException(HTTP_EX_3
+                    .format(new Object[] { cTMime, cTEnc, bomEnc,
+                            xmlGuessEnc, xmlEnc }), cTMime, cTEnc, bomEnc,
+                    xmlGuessEnc, xmlEnc, is);
+        }
+        if (cTEnc == null) {
+            if (appXml) {
+                return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, is);
+            }
+            return (defaultEncoding == null) ? US_ASCII : defaultEncoding;
+        }
+        if (bomEnc != null
+                && (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE))) {
+            throw new XmlStreamReaderException(HTTP_EX_1
+                    .format(new Object[] { cTMime, cTEnc, bomEnc,
+                            xmlGuessEnc, xmlEnc }), cTMime, cTEnc, bomEnc,
+                    xmlGuessEnc, xmlEnc, is);
+        }
+        if (cTEnc.equals(UTF_16)) {
+            if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
+                return bomEnc;
+            }
+            throw new XmlStreamReaderException(HTTP_EX_2
+                    .format(new Object[] { cTMime, cTEnc, bomEnc,
+                            xmlGuessEnc, xmlEnc }), cTMime, cTEnc, bomEnc,
+                    xmlGuessEnc, xmlEnc, is);
+        }
+        return cTEnc;
+    }
+
+    // returns MIME type or NULL if httpContentType is NULL
+    private static String getContentTypeMime(String httpContentType) {
+        String mime = null;
+        if (httpContentType != null) {
+            int i = httpContentType.indexOf(";");
+            mime = ((i == -1) ? httpContentType : httpContentType.substring(0,
+                    i)).trim();
+        }
+        return mime;
+    }
+
+    // Finds the charset parameter of a content-type header; group 1 is the
+    // value without the optional surrounding quotes.
+    private static final Pattern CHARSET_PATTERN = Pattern
+            .compile("charset=[\"']?([.[^; \"']]*)[\"']?");
+
+    /**
+     * Extracts the charset parameter of a content-type header, upper-cased.
+     * <p>
+     * The upper-casing is done with a fixed locale: with the default locale a
+     * name such as "iso-8859-1" would come out with a dotted capital I under a
+     * Turkish locale and no longer be a usable charset name.
+     *
+     * @param httpContentType content-type header, may be NULL.
+     * @return the charset parameter value, NULL if not present, NULL if
+     *         httpContentType is NULL.
+     */
+    private static String getContentTypeEncoding(String httpContentType) {
+        if (httpContentType == null) {
+            return null;
+        }
+        int i = httpContentType.indexOf(";");
+        if (i == -1) {
+            // no parameters after the MIME type
+            return null;
+        }
+        Matcher m = CHARSET_PATTERN.matcher(httpContentType.substring(i + 1));
+        String charset = m.find() ? m.group(1) : null;
+        return (charset == null) ? null
+                : charset.toUpperCase(java.util.Locale.ENGLISH);
+    }
+
+    // returns the BOM in the stream, NULL if not present,
+    // if there was BOM the in the stream it is consumed
+    private static String getBOMEncoding(BufferedInputStream is)
+            throws IOException {
+        String encoding = null;
+        int[] bytes = new int[3];
+        is.mark(3);
+        bytes[0] = is.read();
+        bytes[1] = is.read();
+        bytes[2] = is.read();
+
+        if (bytes[0] == 0xFE && bytes[1] == 0xFF) {
+            encoding = UTF_16BE;
+            is.reset();
+            is.read();
+            is.read();
+        } else if (bytes[0] == 0xFF && bytes[1] == 0xFE) {
+            encoding = UTF_16LE;
+            is.reset();
+            is.read();
+            is.read();
+        } else if (bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
+            encoding = UTF_8;
+        } else {
+            is.reset();
+        }
+        return encoding;
+    }
+
+    // returns the best guess for the encoding by looking the first bytes of the
+    // stream, '<?'
+    private static String getXMLGuessEncoding(BufferedInputStream is)
+            throws IOException {
+        String encoding = null;
+        int[] bytes = new int[4];
+        is.mark(4);
+        bytes[0] = is.read();
+        bytes[1] = is.read();
+        bytes[2] = is.read();
+        bytes[3] = is.read();
+        is.reset();
+
+        if (bytes[0] == 0x00 && bytes[1] == 0x3C && bytes[2] == 0x00
+                && bytes[3] == 0x3F) {
+            encoding = UTF_16BE;
+        } else if (bytes[0] == 0x3C && bytes[1] == 0x00 && bytes[2] == 0x3F
+                && bytes[3] == 0x00) {
+            encoding = UTF_16LE;
+        } else if (bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78
+                && bytes[3] == 0x6D) {
+            encoding = UTF_8;
+        } else if (bytes[0] == 0x4C && bytes[1] == 0x6F && bytes[2] == 0xA7
+                && bytes[3] == 0x94) {
+            encoding = EBCDIC;
+        }
+        return encoding;
+    }
+
+    /**
+     * Finds the encoding pseudo-attribute of an XML declaration; group 1 is
+     * the quoted value, quotes included.
+     */
+    public static final Pattern ENCODING_PATTERN = Pattern.compile(
+            "<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))",
+            Pattern.MULTILINE);
+
+    /**
+     * Reads the encoding declared in the {@code <?xml encoding=...?>} prolog.
+     * <p>
+     * Up to BUFFER_SIZE bytes are decoded with the guessed encoding until the
+     * first '>' shows up; the stream is then put back to where it was. Line
+     * breaks inside the declaration are dropped before matching. The value is
+     * upper-cased with a fixed locale so that the result does not depend on
+     * the default locale (e.g. the Turkish dotted capital I).
+     *
+     * @param is stream to inspect.
+     * @param guessedEnc encoding guessed from the first bytes; when NULL the
+     *        stream is not read at all.
+     * @return the declared encoding in upper case, NULL if none.
+     * @throws IOException thrown if reading fails, or if no '>' is found
+     *         before the end of the stream or of the look-ahead buffer.
+     */
+    private static String getXmlProlog(BufferedInputStream is, String guessedEnc)
+            throws IOException {
+        if (guessedEnc == null) {
+            return null;
+        }
+        String encoding = null;
+        byte[] bytes = new byte[BUFFER_SIZE];
+        is.mark(BUFFER_SIZE);
+        int offset = 0;
+        int max = BUFFER_SIZE;
+        int c = is.read(bytes, offset, max);
+        int firstGT = -1;
+        String xmlProlog = null;
+        while (c != -1 && firstGT == -1 && offset < BUFFER_SIZE) {
+            offset += c;
+            max -= c;
+            c = is.read(bytes, offset, max);
+            xmlProlog = new String(bytes, 0, offset, guessedEnc);
+            firstGT = xmlProlog.indexOf('>');
+        }
+        if (firstGT == -1) {
+            if (c == -1) {
+                throw new IOException("Unexpected end of XML stream");
+            } else {
+                throw new IOException(
+                        "XML prolog or ROOT element not found on first "
+                                + offset + " bytes");
+            }
+        }
+        int bytesRead = offset;
+        if (bytesRead > 0) {
+            is.reset();
+            BufferedReader bReader = new BufferedReader(new StringReader(
+                    xmlProlog.substring(0, firstGT + 1)));
+            // method-local buffer, no synchronization needed
+            StringBuilder prolog = new StringBuilder();
+            String line = bReader.readLine();
+            while (line != null) {
+                prolog.append(line);
+                line = bReader.readLine();
+            }
+            Matcher m = ENCODING_PATTERN.matcher(prolog);
+            if (m.find()) {
+                encoding = m.group(1).toUpperCase(java.util.Locale.ENGLISH);
+                // strip the surrounding quotes
+                encoding = encoding.substring(1, encoding.length() - 1);
+            }
+        }
+        return encoding;
+    }
+
+    /**
+     * Tells whether a MIME type belongs to the APPLICATION XML family: one of
+     * the three fixed application/xml types, or any application type ending
+     * with "+xml".
+     *
+     * @param mime MIME type, may be NULL.
+     * @return true for an application XML type, false otherwise or for NULL.
+     */
+    private static boolean isAppXml(String mime) {
+        if (mime == null) {
+            return false;
+        }
+        if (mime.equals("application/xml")
+                || mime.equals("application/xml-dtd")
+                || mime.equals("application/xml-external-parsed-entity")) {
+            return true;
+        }
+        return mime.startsWith("application/") && mime.endsWith("+xml");
+    }
+
+    /**
+     * Tells whether a MIME type belongs to the TEXT XML family: one of the two
+     * fixed text/xml types, or any text type ending with "+xml".
+     *
+     * @param mime MIME type, may be NULL.
+     * @return true for a text XML type, false otherwise or for NULL.
+     */
+    private static boolean isTextXml(String mime) {
+        if (mime == null) {
+            return false;
+        }
+        if (mime.equals("text/xml")
+                || mime.equals("text/xml-external-parsed-entity")) {
+            return true;
+        }
+        return mime.startsWith("text/") && mime.endsWith("+xml");
+    }
+
+    // Message templates for the XmlStreamReaderException raised by the
+    // encoding rules. RAW_* take BOM, XML guess and XML prolog encodings;
+    // HTTP_* additionally take the content type MIME and encoding first.
+    // NOTE(review): these MessageFormat instances are shared by all threads
+    // and the JDK documents MessageFormat as not synchronized - confirm
+    // whether concurrent construction of readers needs guarding.
+
+    /** BOM, XML guess and XML prolog encodings contradict each other. */
+    private static final MessageFormat RAW_EX_1 = new MessageFormat(
+            "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch");
+
+    /** The BOM encoding is not one the raw stream rules know. */
+    private static final MessageFormat RAW_EX_2 = new MessageFormat(
+            "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM");
+
+    /** A BOM is present although the content type says UTF-16BE or UTF-16LE. */
+    private static final MessageFormat HTTP_EX_1 = new MessageFormat(
+            "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL");
+
+    /** The content type says UTF-16 but there is no UTF-16 BOM. */
+    private static final MessageFormat HTTP_EX_2 = new MessageFormat(
+            "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch");
+
+    /** The MIME type is neither an application XML nor a text XML type. */
+    private static final MessageFormat HTTP_EX_3 = new MessageFormat(
+            "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME");
+
+}

Propchange: commons/proper/io/trunk/src/java/org/apache/commons/io/input/XmlStreamReader.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: commons/proper/io/trunk/src/java/org/apache/commons/io/input/XmlStreamReader.java
------------------------------------------------------------------------------
    svn:keywords = Date Author Id Revision HeadURL

Added: commons/proper/io/trunk/src/java/org/apache/commons/io/input/XmlStreamReaderException.java
URL: http://svn.apache.org/viewvc/commons/proper/io/trunk/src/java/org/apache/commons/io/input/XmlStreamReaderException.java?rev=1004090&view=auto
==============================================================================
--- commons/proper/io/trunk/src/java/org/apache/commons/io/input/XmlStreamReaderException.java (added)
+++ commons/proper/io/trunk/src/java/org/apache/commons/io/input/XmlStreamReaderException.java Mon Oct  4 02:59:49 2010
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.io.input;
+
+import java.io.InputStream;
+import java.io.IOException;
+
+/**
+ * The XmlStreamReaderException is thrown by the XmlStreamReader constructors if
+ * the charset encoding cannot be determined according to the XML 1.0
+ * specification and RFC 3023.
+ * <p>
+ * The exception carries the unconsumed InputStream to allow the application to
+ * do an alternate processing with the stream. Note that the original
+ * InputStream given to the XmlStreamReader cannot be reused, as that one has
+ * already been read.
+ *
+ * @author Alejandro Abdelnur
+ * @version $Id$
+ */
+public class XmlStreamReaderException extends IOException {
+    private final String bomEncoding;
+
+    private final String xmlGuessEncoding;
+
+    private final String xmlEncoding;
+
+    private final String contentTypeMime;
+
+    private final String contentTypeEncoding;
+
+    private final InputStream is;
+
+    /**
+     * Creates an exception instance if the charset encoding could not be
+     * determined; the content-type MIME and encoding are recorded as null.
+     * <p>
+     * Instances of this exception are thrown by the XmlStreamReader.
+     *
+     * @param msg message describing the reason for the exception.
+     * @param bomEnc BOM encoding.
+     * @param xmlGuessEnc XML guess encoding.
+     * @param xmlEnc XML prolog encoding.
+     * @param is the unconsumed InputStream.
+     */
+    public XmlStreamReaderException(String msg, String bomEnc,
+            String xmlGuessEnc, String xmlEnc, InputStream is) {
+        this(msg, null, null, bomEnc, xmlGuessEnc, xmlEnc, is);
+    }
+
+    /**
+     * Creates an exception instance if the charset encoding could not be
+     * determined, also recording the content-type MIME and encoding.
+     * <p>
+     * Instances of this exception are thrown by the XmlStreamReader.
+     *
+     * @param msg message describing the reason for the exception.
+     * @param ctMime MIME type in the content-type.
+     * @param ctEnc encoding in the content-type.
+     * @param bomEnc BOM encoding.
+     * @param xmlGuessEnc XML guess encoding.
+     * @param xmlEnc XML prolog encoding.
+     * @param is the unconsumed InputStream.
+     */
+    public XmlStreamReaderException(String msg, String ctMime, String ctEnc,
+            String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is) {
+        super(msg);
+        contentTypeMime = ctMime;
+        contentTypeEncoding = ctEnc;
+        bomEncoding = bomEnc;
+        xmlGuessEncoding = xmlGuessEnc;
+        xmlEncoding = xmlEnc;
+        this.is = is;
+    }
+
+    /**
+     * Returns the BOM encoding found in the InputStream.
+     *
+     * @return the BOM encoding, null if none.
+     */
+    public String getBomEncoding() {
+        return bomEncoding;
+    }
+
+    /**
+     * Returns the encoding guess based on the first bytes of the InputStream.
+     *
+     * @return the encoding guess, null if it couldn't be guessed.
+     */
+    public String getXmlGuessEncoding() {
+        return xmlGuessEncoding;
+    }
+
+    /**
+     * Returns the encoding found in the XML prolog of the InputStream.
+     *
+     * @return the encoding of the XML prolog, null if none.
+     */
+    public String getXmlEncoding() {
+        return xmlEncoding;
+    }
+
+    /**
+     * Returns the MIME type in the content-type used to attempt determining the
+     * encoding.
+     *
+     * @return the MIME type in the content-type, null if there was no
+     *         content-type or the encoding detection did not involve HTTP.
+     */
+    public String getContentTypeMime() {
+        return contentTypeMime;
+    }
+
+    /**
+     * Returns the encoding in the content-type used to attempt determining the
+     * encoding.
+     *
+     * @return the encoding in the content-type, null if there was no
+     *         content-type, no encoding in it or the encoding detection did not
+     *         involve HTTP.
+     */
+    public String getContentTypeEncoding() {
+        return contentTypeEncoding;
+    }
+
+    /**
+     * Returns the unconsumed InputStream to allow the application to do an
+     * alternate encoding detection on the InputStream.
+     *
+     * @return the unconsumed InputStream.
+     */
+    public InputStream getInputStream() {
+        return is;
+    }
+}

Propchange: commons/proper/io/trunk/src/java/org/apache/commons/io/input/XmlStreamReaderException.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: commons/proper/io/trunk/src/java/org/apache/commons/io/input/XmlStreamReaderException.java
------------------------------------------------------------------------------
    svn:keywords = Date Author Id Revision HeadURL

Added: commons/proper/io/trunk/src/java/org/apache/commons/io/output/XmlStreamWriter.java
URL: http://svn.apache.org/viewvc/commons/proper/io/trunk/src/java/org/apache/commons/io/output/XmlStreamWriter.java?rev=1004090&view=auto
==============================================================================
--- commons/proper/io/trunk/src/java/org/apache/commons/io/output/XmlStreamWriter.java (added)
+++ commons/proper/io/trunk/src/java/org/apache/commons/io/output/XmlStreamWriter.java Mon Oct  4 02:59:49 2010
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.io.output;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.StringWriter;
+import java.io.Writer;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.input.XmlStreamReader;
+
+/**
+ * Character stream that handles all the necessary Voodoo to figure out the
+ * charset encoding of the XML document written to the stream.
+ *
+ * @author Herve Boutemy
+ * @version $Id$
+ * @see XmlStreamReader
+ */
+public class XmlStreamWriter extends Writer {
+    private static final int BUFFER_SIZE = 4096;
+
+    /** Buffers the leading characters; null once the encoding is chosen. */
+    private StringWriter xmlPrologWriter = new StringWriter(BUFFER_SIZE);
+
+    private OutputStream out;
+
+    private Writer writer; // created over out once the encoding is chosen
+
+    private String encoding; // null until chosen
+
+    /** Creates a writer that encodes the XML it receives to the stream. */
+    public XmlStreamWriter(OutputStream out) {
+        this.out = out;
+    }
+
+    /** Creates a writer that encodes the XML it receives to the file. */
+    public XmlStreamWriter(File file) throws FileNotFoundException {
+        this(new FileOutputStream(file));
+    }
+
+    /** Returns the chosen encoding, or null if it is not yet determined. */
+    public String getEncoding() {
+        return encoding;
+    }
+
+    /** Closes the stream, first writing any buffered characters as UTF-8. */
+    public void close() throws IOException {
+        if (writer == null) {
+            encoding = "UTF-8";
+            writer = new OutputStreamWriter(out, encoding);
+            writer.write(xmlPrologWriter.toString());
+        }
+        writer.close();
+    }
+
+    /** Flushes the encoding writer; does nothing while still buffering. */
+    public void flush() throws IOException {
+        if (writer != null) {
+            writer.flush();
+        }
+    }
+
+    /** Buffers characters until the encoding to write with can be chosen. */
+    private void detectEncoding(char[] cbuf, int off, int len)
+            throws IOException {
+        StringBuffer xmlProlog = xmlPrologWriter.getBuffer();
+        int size = Math.min(len, BUFFER_SIZE - xmlProlog.length());
+        xmlPrologWriter.write(cbuf, off, size);
+        if (xmlProlog.length() < 5) {
+            return; // too short to tell whether an XML prolog is present
+        }
+        if (!xmlProlog.substring(0, 5).equals("<?xml")) {
+            // no XML prolog: using default encoding
+            encoding = "UTF-8";
+        } else {
+            int xmlPrologEnd = xmlProlog.indexOf("?>");
+            if (xmlPrologEnd > 0) {
+                // ok, full XML prolog written: let's extract encoding
+                Matcher m = ENCODING_PATTERN.matcher(xmlProlog.substring(0,
+                        xmlPrologEnd));
+                if (m.find()) {
+                    // fixed locale: Turkish would not upper-case 'i' to 'I'
+                    encoding = m.group(1).toUpperCase(java.util.Locale.ENGLISH);
+                    encoding = encoding.substring(1, encoding.length() - 1);
+                } else {
+                    // no encoding found in XML prolog: using default encoding
+                    encoding = "UTF-8";
+                }
+            } else if (xmlProlog.length() >= BUFFER_SIZE) {
+                // no encoding found in first characters: using default encoding
+                encoding = "UTF-8";
+            }
+        }
+        if (encoding != null) {
+            // encoding chosen: open the writer before dropping the buffer
+            writer = new OutputStreamWriter(out, encoding);
+            xmlPrologWriter = null;
+            writer.write(xmlProlog.toString());
+            if (len > size) {
+                writer.write(cbuf, off + size, len - size);
+            }
+        }
+    }
+
+    /** Writes characters, buffering them until the encoding is detected. */
+    public void write(char[] cbuf, int off, int len) throws IOException {
+        if (xmlPrologWriter != null) {
+            detectEncoding(cbuf, off, len);
+        } else {
+            writer.write(cbuf, off, len);
+        }
+    }
+
+    static final Pattern ENCODING_PATTERN = XmlStreamReader.ENCODING_PATTERN;
+}

Propchange: commons/proper/io/trunk/src/java/org/apache/commons/io/output/XmlStreamWriter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: commons/proper/io/trunk/src/java/org/apache/commons/io/output/XmlStreamWriter.java
------------------------------------------------------------------------------
    svn:keywords = Date Author Id Revision HeadURL

Added: commons/proper/io/trunk/src/test/org/apache/commons/io/input/XmlStreamReaderTest.java
URL: http://svn.apache.org/viewvc/commons/proper/io/trunk/src/test/org/apache/commons/io/input/XmlStreamReaderTest.java?rev=1004090&view=auto
==============================================================================
--- commons/proper/io/trunk/src/test/org/apache/commons/io/input/XmlStreamReaderTest.java (added)
+++ commons/proper/io/trunk/src/test/org/apache/commons/io/input/XmlStreamReaderTest.java Mon Oct  4 02:59:49 2010
@@ -0,0 +1,360 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.io.input;
+
+import java.io.*;
+import java.text.MessageFormat;
+import java.util.HashMap;
+import java.util.Map;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests the encoding detection performed by {@link XmlStreamReader}.
+ *
+ * @author Alejandro Abdelnur
+ */
+public class XmlStreamReaderTest extends TestCase {
+    private static final String XML5 = "xml-prolog-encoding-spaced-single-quotes";
+    private static final String XML4 = "xml-prolog-encoding-single-quotes";
+    private static final String XML3 = "xml-prolog-encoding-double-quotes";
+    private static final String XML2 = "xml-prolog";
+    private static final String XML1 = "xml";
+
+    protected void _testRawNoBomValid(String encoding) throws Exception {
+        InputStream is = getXmlStream("no-bom", XML1, encoding, encoding);
+        XmlStreamReader xmlReader = new XmlStreamReader(is, false);
+        assertEquals("UTF-8", xmlReader.getEncoding());
+
+        is = getXmlStream("no-bom", XML2, encoding, encoding);
+        xmlReader = new XmlStreamReader(is);
+        assertEquals("UTF-8", xmlReader.getEncoding());
+
+        is = getXmlStream("no-bom", XML3, encoding, encoding);
+        xmlReader = new XmlStreamReader(is);
+        assertEquals(encoding, xmlReader.getEncoding());
+
+        is = getXmlStream("no-bom", XML4, encoding, encoding);
+        xmlReader = new XmlStreamReader(is);
+        assertEquals(encoding, xmlReader.getEncoding());
+
+        is = getXmlStream("no-bom", XML5, encoding, encoding);
+        xmlReader = new XmlStreamReader(is);
+        assertEquals(encoding, xmlReader.getEncoding());
+    }
+
+    protected void _testRawNoBomInvalid(String encoding) throws Exception {
+        InputStream is = getXmlStream("no-bom", XML3, encoding, encoding);
+        try {
+            new XmlStreamReader(is, false);
+            fail("It should have failed");
+        } catch (IOException ex) {
+            assertTrue(ex.getMessage().indexOf("Invalid encoding,") > -1);
+        }
+    }
+
+    public void testRawNoBom() throws Exception {
+        _testRawNoBomValid("US-ASCII");
+        _testRawNoBomValid("UTF-8");
+        _testRawNoBomValid("ISO-8859-1");
+        _testRawNoBomValid("CP1047");
+    }
+
+    protected void _testRawBomValid(String encoding) throws Exception {
+        InputStream is = getXmlStream(encoding + "-bom", XML3, encoding,
+                encoding);
+        XmlStreamReader xmlReader = new XmlStreamReader(is, false);
+        if (!encoding.equals("UTF-16")) {
+            assertEquals(encoding, xmlReader.getEncoding());
+        } else {
+            assertEquals(encoding, xmlReader.getEncoding()
+                    .substring(0, encoding.length()));
+        }
+    }
+
+    protected void _testRawBomInvalid(String bomEnc, String streamEnc,
+            String prologEnc) throws Exception {
+        InputStream is = getXmlStream(bomEnc, XML3, streamEnc, prologEnc);
+        try {
+            XmlStreamReader xmlReader = new XmlStreamReader(is, false);
+            String foundEnc = xmlReader.getEncoding();
+            fail("It should have failed for BOM " + bomEnc + ", streamEnc "
+                    + streamEnc + " and prologEnc " + prologEnc + ": found "
+                    + foundEnc);
+        } catch (IOException ex) {
+            assertTrue(ex.getMessage().indexOf("Invalid encoding,") > -1);
+        }
+    }
+
+    public void testRawBom() throws Exception {
+        _testRawBomValid("UTF-8");
+        _testRawBomValid("UTF-16BE");
+        _testRawBomValid("UTF-16LE");
+        _testRawBomValid("UTF-16");
+
+        _testRawBomInvalid("UTF-8-bom", "US-ASCII", "US-ASCII");
+        _testRawBomInvalid("UTF-8-bom", "ISO-8859-1", "ISO-8859-1");
+        _testRawBomInvalid("UTF-8-bom", "UTF-8", "UTF-16");
+        _testRawBomInvalid("UTF-8-bom", "UTF-8", "UTF-16BE");
+        _testRawBomInvalid("UTF-8-bom", "UTF-8", "UTF-16LE");
+        _testRawBomInvalid("UTF-16BE-bom", "UTF-16BE", "UTF-16LE");
+        _testRawBomInvalid("UTF-16LE-bom", "UTF-16LE", "UTF-16BE");
+        _testRawBomInvalid("UTF-16LE-bom", "UTF-16LE", "UTF-8");
+    }
+
+    public void testHttp() throws Exception {
+        _testHttpValid("application/xml", "no-bom", "US-ASCII", null);
+        _testHttpValid("application/xml", "UTF-8-bom", "US-ASCII", null);
+        _testHttpValid("application/xml", "UTF-8-bom", "UTF-8", null);
+        _testHttpValid("application/xml", "UTF-8-bom", "UTF-8", "UTF-8");
+        _testHttpValid("application/xml;charset=UTF-8", "UTF-8-bom", "UTF-8",
+                null);
+        _testHttpValid("application/xml;charset=\"UTF-8\"", "UTF-8-bom",
+                "UTF-8", null);
+        _testHttpValid("application/xml;charset='UTF-8'", "UTF-8-bom", "UTF-8",
+                null);
+        _testHttpValid("application/xml;charset=UTF-8", "UTF-8-bom", "UTF-8",
+                "UTF-8");
+        _testHttpValid("application/xml;charset=UTF-16", "UTF-16BE-bom",
+                "UTF-16BE", null);
+        _testHttpValid("application/xml;charset=UTF-16", "UTF-16BE-bom",
+                "UTF-16BE", "UTF-16");
+        _testHttpValid("application/xml;charset=UTF-16", "UTF-16BE-bom",
+                "UTF-16BE", "UTF-16BE");
+
+        _testHttpInvalid("application/xml;charset=UTF-16BE", "UTF-16BE-bom",
+                "UTF-16BE", null);
+        _testHttpInvalid("application/xml;charset=UTF-16BE", "UTF-16BE-bom",
+                "UTF-16BE", "UTF-16");
+        _testHttpInvalid("application/xml;charset=UTF-16BE", "UTF-16BE-bom",
+                "UTF-16BE", "UTF-16BE");
+        _testHttpInvalid("application/xml", "UTF-8-bom", "US-ASCII", "US-ASCII");
+        _testHttpInvalid("application/xml;charset=UTF-16", "UTF-16LE", "UTF-8",
+                "UTF-8");
+        _testHttpInvalid("application/xml;charset=UTF-16", "no-bom",
+                "UTF-16BE", "UTF-16BE");
+
+        _testHttpValid("text/xml", "no-bom", "US-ASCII", null);
+        _testHttpValid("text/xml;charset=UTF-8", "UTF-8-bom", "UTF-8", "UTF-8");
+        _testHttpValid("text/xml;charset=UTF-8", "UTF-8-bom", "UTF-8", null);
+        _testHttpValid("text/xml;charset=UTF-16", "UTF-16BE-bom", "UTF-16BE",
+                null);
+        _testHttpValid("text/xml;charset=UTF-16", "UTF-16BE-bom", "UTF-16BE",
+                "UTF-16");
+        _testHttpValid("text/xml;charset=UTF-16", "UTF-16BE-bom", "UTF-16BE",
+                "UTF-16BE");
+        _testHttpValid("text/xml", "UTF-8-bom", "US-ASCII", null);
+
+        _testAlternateDefaultEncoding("application/xml", "UTF-8-bom", "UTF-8",
+                null, null);
+        _testAlternateDefaultEncoding("application/xml", "no-bom", "US-ASCII",
+                null, "US-ASCII");
+        _testAlternateDefaultEncoding("application/xml", "UTF-8-bom", "UTF-8",
+                null, "UTF-8");
+        _testAlternateDefaultEncoding("text/xml", "no-bom", "US-ASCII", null,
+                null);
+        _testAlternateDefaultEncoding("text/xml", "no-bom", "US-ASCII", null,
+                "US-ASCII");
+        _testAlternateDefaultEncoding("text/xml", "no-bom", "US-ASCII", null,
+                "UTF-8");
+
+        _testHttpInvalid("text/xml;charset=UTF-16BE", "UTF-16BE-bom",
+                "UTF-16BE", null);
+        _testHttpInvalid("text/xml;charset=UTF-16BE", "UTF-16BE-bom",
+                "UTF-16BE", "UTF-16");
+        _testHttpInvalid("text/xml;charset=UTF-16BE", "UTF-16BE-bom",
+                "UTF-16BE", "UTF-16BE");
+        _testHttpInvalid("text/xml;charset=UTF-16", "no-bom", "UTF-16BE",
+                "UTF-16BE");
+        _testHttpInvalid("text/xml;charset=UTF-16", "no-bom", "UTF-16BE", null);
+
+        _testHttpLenient("text/xml", "no-bom", "US-ASCII", null, "US-ASCII");
+        _testHttpLenient("text/xml;charset=UTF-8", "UTF-8-bom", "UTF-8",
+                "UTF-8", "UTF-8");
+        _testHttpLenient("text/xml;charset=UTF-8", "UTF-8-bom", "UTF-8", null,
+                "UTF-8");
+        _testHttpLenient("text/xml;charset=UTF-16", "UTF-16BE-bom", "UTF-16BE",
+                null, "UTF-16BE");
+        _testHttpLenient("text/xml;charset=UTF-16", "UTF-16BE-bom", "UTF-16BE",
+                "UTF-16", "UTF-16");
+        _testHttpLenient("text/xml;charset=UTF-16", "UTF-16BE-bom", "UTF-16BE",
+                "UTF-16BE", "UTF-16BE");
+        _testHttpLenient("text/xml", "UTF-8-bom", "US-ASCII", null, "US-ASCII");
+
+        _testHttpLenient("text/xml;charset=UTF-16BE", "UTF-16BE-bom",
+                "UTF-16BE", null, "UTF-16BE");
+        _testHttpLenient("text/xml;charset=UTF-16BE", "UTF-16BE-bom",
+                "UTF-16BE", "UTF-16", "UTF-16");
+        _testHttpLenient("text/xml;charset=UTF-16BE", "UTF-16BE-bom",
+                "UTF-16BE", "UTF-16BE", "UTF-16BE");
+        _testHttpLenient("text/xml;charset=UTF-16", "no-bom", "UTF-16BE",
+                "UTF-16BE", "UTF-16BE");
+        _testHttpLenient("text/xml;charset=UTF-16", "no-bom", "UTF-16BE", null,
+                "UTF-16");
+
+        _testHttpLenient("text/html", "no-bom", "US-ASCII", "US-ASCII",
+                "US-ASCII");
+        _testHttpLenient("text/html", "no-bom", "US-ASCII", null, "US-ASCII");
+        _testHttpLenient("text/html;charset=UTF-8", "no-bom", "US-ASCII",
+                "UTF-8", "UTF-8");
+        _testHttpLenient("text/html;charset=UTF-16BE", "no-bom", "US-ASCII",
+                "UTF-8", "UTF-8");
+    }
+
+    public void _testAlternateDefaultEncoding(String cT, String bomEnc,
+            String streamEnc, String prologEnc, String alternateEnc)
+            throws Exception {
+        try {
+            InputStream is = getXmlStream(bomEnc, (prologEnc == null) ? XML1
+                    : XML3, streamEnc, prologEnc);
+            XmlStreamReader.setDefaultEncoding(alternateEnc);
+            XmlStreamReader xmlReader = new XmlStreamReader(is, cT, false);
+            // we can not assert things for other encodings because UTF-8,
+            // US-ASCII and ISO-8859-1 look alike for the chars used for detection
+            if (streamEnc.equals("UTF-16")) {
+                assertEquals(streamEnc, xmlReader.getEncoding().substring(0,
+                        streamEnc.length()));
+            }
+        } finally {
+            XmlStreamReader.setDefaultEncoding(null);
+        }
+    }
+
+    public void _testHttpValid(String cT, String bomEnc, String streamEnc,
+            String prologEnc) throws Exception {
+        InputStream is = getXmlStream(bomEnc,
+                (prologEnc == null) ? XML1 : XML3, streamEnc, prologEnc);
+        XmlStreamReader xmlReader = new XmlStreamReader(is, cT, false);
+        // we can not assert things for other encodings because UTF-8,
+        // US-ASCII and ISO-8859-1 look alike for the chars used for detection
+        if (streamEnc.equals("UTF-16")) {
+            assertEquals(streamEnc, xmlReader.getEncoding().substring(0,
+                    streamEnc.length()));
+        }
+    }
+
+    protected void _testHttpInvalid(String cT, String bomEnc, String streamEnc,
+            String prologEnc) throws Exception {
+        InputStream is = getXmlStream(bomEnc,
+                (prologEnc == null) ? XML2 : XML3, streamEnc, prologEnc);
+        try {
+            new XmlStreamReader(is, cT, false);
+            fail("It should have failed for HTTP Content-type " + cT + ", BOM "
+                    + bomEnc + ", streamEnc " + streamEnc + " and prologEnc "
+                    + prologEnc);
+        } catch (IOException ex) {
+            assertTrue(ex.getMessage().indexOf("Invalid encoding,") > -1);
+        }
+    }
+
+    protected void _testHttpLenient(String cT, String bomEnc, String streamEnc,
+            String prologEnc, String shouldbe) throws Exception {
+        InputStream is = getXmlStream(bomEnc,
+                (prologEnc == null) ? XML2 : XML3, streamEnc, prologEnc);
+        XmlStreamReader xmlReader = new XmlStreamReader(is, cT, true);
+        assertEquals(shouldbe, xmlReader.getEncoding());
+    }
+
+    private static final String ENCODING_ATTRIBUTE_XML = "<?xml version=\"1.0\" ?> \n"
+            + "<atom:feed xmlns:atom=\"http://www.w3.org/2005/Atom\">\n"
+            + "\n"
+            + "  <atom:entry>\n"
+            + "    <atom:title encoding='base64'><![CDATA\n"
+            + "aW5nTGluZSIgLz4";
+
+    public void testEncodingAttributeXML() throws Exception {
+        InputStream is = new ByteArrayInputStream(ENCODING_ATTRIBUTE_XML
+                .getBytes("UTF-8"));
+        XmlStreamReader xmlReader = new XmlStreamReader(is, "", true);
+        assertEquals("UTF-8", xmlReader.getEncoding());
+    }
+
+    // XML Stream generator
+    private static final int[] NO_BOM_BYTES = {};
+    private static final int[] UTF_16BE_BOM_BYTES = { 0xFE, 0xFF };
+    private static final int[] UTF_16LE_BOM_BYTES = { 0xFF, 0xFE };
+    private static final int[] UTF_8_BOM_BYTES = { 0xEF, 0xBB, 0xBF };
+
+    private static final Map<String, int[]> BOMs = new HashMap<String, int[]>();
+    static {
+        BOMs.put("no-bom", NO_BOM_BYTES);
+        BOMs.put("UTF-16BE-bom", UTF_16BE_BOM_BYTES);
+        BOMs.put("UTF-16LE-bom", UTF_16LE_BOM_BYTES);
+        BOMs.put("UTF-16-bom", NO_BOM_BYTES); // it's added by the writer
+        BOMs.put("UTF-8-bom", UTF_8_BOM_BYTES);
+    }
+
+    private static final MessageFormat XML = new MessageFormat(
+            "<root>{2}</root>");
+    private static final MessageFormat XML_WITH_PROLOG = new MessageFormat(
+            "<?xml version=\"1.0\"?>\n<root>{2}</root>");
+    private static final MessageFormat XML_WITH_PROLOG_AND_ENCODING_DOUBLE_QUOTES = new MessageFormat(
+            "<?xml version=\"1.0\" encoding=\"{1}\"?>\n<root>{2}</root>");
+    private static final MessageFormat XML_WITH_PROLOG_AND_ENCODING_SINGLE_QUOTES = new MessageFormat(
+            "<?xml version=\"1.0\" encoding=''{1}''?>\n<root>{2}</root>");
+    private static final MessageFormat XML_WITH_PROLOG_AND_ENCODING_SPACED_SINGLE_QUOTES = new MessageFormat(
+            "<?xml version=\"1.0\" encoding =  \t \n \r''{1}''?>\n<root>{2}</root>");
+
+    private static final MessageFormat INFO = new MessageFormat(
+            "\nBOM : {0}\nDoc : {1}\nStream Enc : {2}\nProlog Enc : {3}\n");
+
+    private static final Map<String,MessageFormat> XMLs = new HashMap<String,MessageFormat>();
+    static {
+        XMLs.put(XML1, XML);
+        XMLs.put(XML2, XML_WITH_PROLOG);
+        XMLs.put(XML3, XML_WITH_PROLOG_AND_ENCODING_DOUBLE_QUOTES);
+        XMLs.put(XML4, XML_WITH_PROLOG_AND_ENCODING_SINGLE_QUOTES);
+        XMLs.put(XML5, XML_WITH_PROLOG_AND_ENCODING_SPACED_SINGLE_QUOTES);
+    }
+
+    /**
+     * Builds an XML document stream preceded by the requested BOM bytes.
+     *
+     * @param bomType no-bom, UTF-16BE-bom, UTF-16LE-bom, UTF-8-bom
+     * @param xmlType one of the XML1 to XML5 document template names
+     * @param streamEnc encoding used to write the document characters
+     * @param prologEnc encoding name inserted in the XML prolog, if it has one
+     * @return XML stream
+     * @throws IOException if the document cannot be written in streamEnc
+     */
+    protected InputStream getXmlStream(String bomType, String xmlType,
+            String streamEnc, String prologEnc) throws IOException {
+        ByteArrayOutputStream baos = new ByteArrayOutputStream(1024);
+        int[] bom = BOMs.get(bomType);
+        if (bom == null) {
+            bom = new int[0];
+        }
+        MessageFormat xml = XMLs.get(xmlType);
+        for (int i = 0; i < bom.length; i++) {
+            baos.write(bom[i]);
+        }
+        Writer writer = new OutputStreamWriter(baos, streamEnc);
+        String info = INFO.format(new Object[] { bomType, xmlType, streamEnc, prologEnc });
+        String xmlDoc = xml.format(new Object[] { streamEnc, prologEnc, info });
+        writer.write(xmlDoc);
+
+        // PADDING TO TEST THINGS WORK BEYOND PUSHBACK_SIZE
+        writer.write("<da>\n");
+        for (int i = 0; i < 10000; i++) {
+            writer.write("<do/>\n");
+        }
+        writer.write("</da>\n");
+
+        writer.close();
+        return new ByteArrayInputStream(baos.toByteArray());
+    }
+}

Propchange: commons/proper/io/trunk/src/test/org/apache/commons/io/input/XmlStreamReaderTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: commons/proper/io/trunk/src/test/org/apache/commons/io/input/XmlStreamReaderTest.java
------------------------------------------------------------------------------
    svn:keywords = Date Author Id Revision HeadURL

Added: commons/proper/io/trunk/src/test/org/apache/commons/io/output/XmlStreamWriterTest.java
URL: http://svn.apache.org/viewvc/commons/proper/io/trunk/src/test/org/apache/commons/io/output/XmlStreamWriterTest.java?rev=1004090&view=auto
==============================================================================
--- commons/proper/io/trunk/src/test/org/apache/commons/io/output/XmlStreamWriterTest.java (added)
+++ commons/proper/io/trunk/src/test/org/apache/commons/io/output/XmlStreamWriterTest.java Mon Oct  4 02:59:49 2010
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.io.output;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+import junit.framework.TestCase;
+
+/**
+ * Checks that text written through {@link XmlStreamWriter} decodes back unchanged
+ * using the encoding named in its XML declaration, or UTF-8 when none is named.
+ *
+ * @author Herve Boutemy
+ * @version $Id$
+ */
+public class XmlStreamWriterTest extends TestCase {
+    /** French sample: e with acute accent. */
+    private static final String TEXT_LATIN1 = "eacute: \u00E9";
+    /** Greek sample: small letter alpha. */
+    private static final String TEXT_LATIN7 = "alpha: \u03B1";
+    /** Euro sign sample. */
+    private static final String TEXT_LATIN15 = "euro: \u20AC";
+    /** Japanese sample: hiragana letter A. */
+    private static final String TEXT_EUC_JP = "hiragana A: \u3042";
+    /** Every sample above joined together, for the Unicode encodings. */
+    private static final String TEXT_UNICODE = TEXT_LATIN1 + ", " + TEXT_LATIN7
+            + ", " + TEXT_LATIN15 + ", " + TEXT_EUC_JP;
+
+    /** Wraps text in a text element after an XML declaration; the encoding is optional. */
+    private static String createXmlContent(String text, String encoding) {
+        String encodingAttr = (encoding == null) ? "" : " encoding=\"" + encoding + "\"";
+        return "<?xml version=\"1.0\"" + encodingAttr + "?>" + "\n<text>" + text + "</text>";
+    }
+
+    /** Writes xml through an XmlStreamWriter and asserts the bytes decode back to it. */
+    private static void checkXmlContent(String xml, String encoding) throws IOException {
+        ByteArrayOutputStream sink = new ByteArrayOutputStream();
+        XmlStreamWriter xmlWriter = new XmlStreamWriter(sink);
+        xmlWriter.write(xml);
+        xmlWriter.close();
+        assertEquals(xml, new String(sink.toByteArray(), encoding));
+    }
+
+    /** Round-trips a generated document; a null encoding is checked as UTF-8. */
+    private static void checkXmlWriter(String text, String encoding) throws IOException {
+        String expectedEncoding = encoding;
+        if (expectedEncoding == null) {
+            expectedEncoding = "UTF-8";
+        }
+        checkXmlContent(createXmlContent(text, encoding), expectedEncoding);
+    }
+
+    /** A document without an XML declaration is expected to come out as UTF-8. */
+    public void testNoXmlHeader() throws IOException {
+        checkXmlContent("<text>text with no XML header</text>", "UTF-8");
+    }
+
+    /** Flushing around an empty write and a one-character write; no output is checked. */
+    public void testEmpty() throws IOException {
+        XmlStreamWriter xmlWriter = new XmlStreamWriter(new ByteArrayOutputStream());
+        xmlWriter.flush();
+        xmlWriter.write("");
+        xmlWriter.flush();
+        xmlWriter.write(".");
+        xmlWriter.flush();
+        xmlWriter.close();
+    }
+
+    public void testDefaultEncoding() throws IOException {
+        checkXmlWriter(TEXT_UNICODE, null);
+    }
+
+    public void testUTF8Encoding() throws IOException {
+        checkXmlWriter(TEXT_UNICODE, "UTF-8");
+    }
+
+    public void testUTF16Encoding() throws IOException {
+        checkXmlWriter(TEXT_UNICODE, "UTF-16");
+    }
+
+    public void testUTF16BEEncoding() throws IOException {
+        checkXmlWriter(TEXT_UNICODE, "UTF-16BE");
+    }
+
+    public void testUTF16LEEncoding() throws IOException {
+        checkXmlWriter(TEXT_UNICODE, "UTF-16LE");
+    }
+
+    public void testLatin1Encoding() throws IOException {
+        checkXmlWriter(TEXT_LATIN1, "ISO-8859-1");
+    }
+
+    public void testLatin7Encoding() throws IOException {
+        checkXmlWriter(TEXT_LATIN7, "ISO-8859-7");
+    }
+
+    public void testLatin15Encoding() throws IOException {
+        checkXmlWriter(TEXT_LATIN15, "ISO-8859-15");
+    }
+
+    public void testEUC_JPEncoding() throws IOException {
+        checkXmlWriter(TEXT_EUC_JP, "EUC-JP");
+    }
+
+    public void testEBCDICEncoding() throws IOException {
+        checkXmlWriter("simple text in EBCDIC", "CP1047");
+    }
+}

Propchange: commons/proper/io/trunk/src/test/org/apache/commons/io/output/XmlStreamWriterTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: commons/proper/io/trunk/src/test/org/apache/commons/io/output/XmlStreamWriterTest.java
------------------------------------------------------------------------------
    svn:keywords = Date Author Id Revision HeadURL