You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by gg...@apache.org on 2021/01/21 16:36:44 UTC
[commons-io] 01/02: Sort members.
This is an automated email from the ASF dual-hosted git repository.
ggregory pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-io.git
commit e403b84597a4299d1cd2d1b93224eade2530128c
Author: Gary Gregory <ga...@gmail.com>
AuthorDate: Thu Jan 21 11:22:50 2021 -0500
Sort members.
---
.../apache/commons/io/input/XmlStreamReader.java | 708 ++++++++++-----------
1 file changed, 354 insertions(+), 354 deletions(-)
diff --git a/src/main/java/org/apache/commons/io/input/XmlStreamReader.java b/src/main/java/org/apache/commons/io/input/XmlStreamReader.java
index d77d382..ee80736 100644
--- a/src/main/java/org/apache/commons/io/input/XmlStreamReader.java
+++ b/src/main/java/org/apache/commons/io/input/XmlStreamReader.java
@@ -110,24 +110,161 @@ public class XmlStreamReader extends Reader {
new ByteOrderMark(EBCDIC, 0x4C, 0x6F, 0xA7, 0x94)
};
- private final Reader reader;
+ private static final Pattern CHARSET_PATTERN = Pattern
+ .compile("charset=[\"']?([.[^; \"']]*)[\"']?");
- private final String encoding;
+ /**
+ * Pattern capturing the encoding of the "xml" processing instruction.
+ */
+ public static final Pattern ENCODING_PATTERN = Pattern.compile(
+ "<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))",
+ Pattern.MULTILINE);
- private final String defaultEncoding;
+ private static final String RAW_EX_1 =
+ "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";
+
+ private static final String RAW_EX_2 =
+ "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";
+
+ private static final String HTTP_EX_1 =
+ "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL";
+
+ private static final String HTTP_EX_2 =
+ "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";
+
+ private static final String HTTP_EX_3 =
+ "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME";
/**
- * Returns the default encoding to use if none is set in HTTP content-type,
- * XML prolog and the rules based on content-type are not adequate.
- * <p>
- * If it is NULL the content-type based rules are used.
+ * Returns charset parameter value, NULL if not present, NULL if
+ * httpContentType is NULL.
*
- * @return the default encoding to use.
+ * @param httpContentType the HTTP content type
+ * @return The content type encoding (upcased)
*/
- public String getDefaultEncoding() {
- return defaultEncoding;
+ static String getContentTypeEncoding(final String httpContentType) {
+ String encoding = null;
+ if (httpContentType != null) {
+ final int i = httpContentType.indexOf(";");
+ if (i > -1) {
+ final String postMime = httpContentType.substring(i + 1);
+ final Matcher m = CHARSET_PATTERN.matcher(postMime);
+ encoding = m.find() ? m.group(1) : null;
+ encoding = encoding != null ? encoding.toUpperCase(Locale.ROOT) : null;
+ }
+ }
+ return encoding;
+ }
+
+ /**
+ * Returns MIME type or NULL if httpContentType is NULL.
+ *
+ * @param httpContentType the HTTP content type
+ * @return The mime content type
+ */
+ static String getContentTypeMime(final String httpContentType) {
+ String mime = null;
+ if (httpContentType != null) {
+ final int i = httpContentType.indexOf(";");
+ if (i >= 0) {
+ mime = httpContentType.substring(0, i);
+ } else {
+ mime = httpContentType;
+ }
+ mime = mime.trim();
+ }
+ return mime;
+ }
+
+ /**
+ * Returns the encoding declared in the <?xml encoding=...?>, NULL if none.
+ *
+ * @param inputStream InputStream to create the reader from.
+ * @param guessedEnc guessed encoding
+ * @return the encoding declared in the <?xml encoding=...?>
+ * @throws IOException thrown if there is a problem reading the stream.
+ */
+ private static String getXmlProlog(final InputStream inputStream, final String guessedEnc)
+ throws IOException {
+ String encoding = null;
+ if (guessedEnc != null) {
+ final byte[] bytes = new byte[BUFFER_SIZE];
+ inputStream.mark(BUFFER_SIZE);
+ int offset = 0;
+ int max = BUFFER_SIZE;
+ int c = inputStream.read(bytes, offset, max);
+ int firstGT = -1;
+ String xmlProlog = ""; // avoid possible NPE warning (cannot happen; this just silences the warning)
+ while (c != -1 && firstGT == -1 && offset < BUFFER_SIZE) {
+ offset += c;
+ max -= c;
+ c = inputStream.read(bytes, offset, max);
+ xmlProlog = new String(bytes, 0, offset, guessedEnc);
+ firstGT = xmlProlog.indexOf('>');
+ }
+ if (firstGT == -1) {
+ if (c == -1) {
+ throw new IOException("Unexpected end of XML stream");
+ }
+ throw new IOException(
+ "XML prolog or ROOT element not found on first "
+ + offset + " bytes");
+ }
+ final int bytesRead = offset;
+ if (bytesRead > 0) {
+ inputStream.reset();
+ final BufferedReader bReader = new BufferedReader(new StringReader(
+ xmlProlog.substring(0, firstGT + 1)));
+ final StringBuffer prolog = new StringBuffer();
+ String line;
+ while ((line = bReader.readLine()) != null) {
+ prolog.append(line);
+ }
+ final Matcher m = ENCODING_PATTERN.matcher(prolog);
+ if (m.find()) {
+ encoding = m.group(1).toUpperCase(Locale.ROOT);
+ encoding = encoding.substring(1, encoding.length() - 1);
+ }
+ }
+ }
+ return encoding;
+ }
+
+ /**
+ * Indicates if the MIME type belongs to the APPLICATION XML family.
+ *
+ * @param mime The mime type
+ * @return true if the mime type belongs to the APPLICATION XML family,
+ * otherwise false
+ */
+ static boolean isAppXml(final String mime) {
+ return mime != null &&
+ (mime.equals("application/xml") ||
+ mime.equals("application/xml-dtd") ||
+ mime.equals("application/xml-external-parsed-entity") ||
+ mime.startsWith("application/") && mime.endsWith("+xml"));
+ }
+
+ /**
+ * Indicates if the MIME type belongs to the TEXT XML family.
+ *
+ * @param mime The mime type
+ * @return true if the mime type belongs to the TEXT XML family,
+ * otherwise false
+ */
+ static boolean isTextXml(final String mime) {
+ return mime != null &&
+ (mime.equals("text/xml") ||
+ mime.equals("text/xml-external-parsed-entity") ||
+ mime.startsWith("text/") && mime.endsWith("+xml"));
}
+ private final Reader reader;
+
+ private final String encoding;
+
+ private final String defaultEncoding;
+
/**
* Creates a Reader for a File.
* <p>
@@ -229,62 +366,6 @@ public class XmlStreamReader extends Reader {
}
/**
- * Creates a Reader using the InputStream of a URL.
- * <p>
- * If the URL is not of type HTTP and there is not 'content-type' header in
- * the fetched data it uses the same logic used for Files.
- * <p>
- * If the URL is a HTTP Url or there is a 'content-type' header in the
- * fetched data it uses the same logic used for an InputStream with
- * content-type.
- * <p>
- * It does a lenient charset encoding detection, check the constructor with
- * the lenient parameter for details.
- *
- * @param url URL to create a Reader from.
- * @throws IOException thrown if there is a problem reading the stream of
- * the URL.
- */
- public XmlStreamReader(final URL url) throws IOException {
- this(Objects.requireNonNull(url, "url").openConnection(), null);
- }
-
- /**
- * Creates a Reader using the InputStream of a URLConnection.
- * <p>
- * If the URLConnection is not of type HttpURLConnection and there is not
- * 'content-type' header in the fetched data it uses the same logic used for
- * files.
- * <p>
- * If the URLConnection is a HTTP Url or there is a 'content-type' header in
- * the fetched data it uses the same logic used for an InputStream with
- * content-type.
- * <p>
- * It does a lenient charset encoding detection, check the constructor with
- * the lenient parameter for details.
- *
- * @param conn URLConnection to create a Reader from.
- * @param defaultEncoding The default encoding
- * @throws IOException thrown if there is a problem reading the stream of
- * the URLConnection.
- */
- public XmlStreamReader(final URLConnection conn, final String defaultEncoding) throws IOException {
- Objects.requireNonNull(conn, "conm");
- this.defaultEncoding = defaultEncoding;
- final boolean lenient = true;
- final String contentType = conn.getContentType();
- final InputStream inputStream = conn.getInputStream();
- final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(inputStream, BUFFER_SIZE), false, BOMS);
- final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
- if (conn instanceof HttpURLConnection || contentType != null) {
- this.encoding = processHttpStream(bom, pis, contentType, lenient);
- } else {
- this.encoding = doRawStream(bom, pis, lenient);
- }
- this.reader = new InputStreamReader(pis, encoding);
- }
-
- /**
* Creates a Reader using an InputStream and the associated content-type
* header.
* <p>
@@ -306,6 +387,7 @@ public class XmlStreamReader extends Reader {
this(inputStream, httpContentType, true);
}
+
/**
* Creates a Reader using an InputStream and the associated content-type
* header. This constructor is lenient regarding the encoding detection.
@@ -335,19 +417,13 @@ public class XmlStreamReader extends Reader {
* the charset encoding.
* @param lenient indicates if the charset encoding detection should be
* relaxed.
- * @param defaultEncoding The default encoding
* @throws IOException thrown if there is a problem reading the file.
* @throws XmlStreamReaderException thrown if the charset encoding could not
* be determined according to the specs.
*/
public XmlStreamReader(final InputStream inputStream, final String httpContentType,
- final boolean lenient, final String defaultEncoding) throws IOException {
- Objects.requireNonNull(inputStream, "inputStream");
- this.defaultEncoding = defaultEncoding;
- final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(inputStream, BUFFER_SIZE), false, BOMS);
- final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
- this.encoding = processHttpStream(bom, pis, httpContentType, lenient);
- this.reader = new InputStreamReader(pis, encoding);
+ final boolean lenient) throws IOException {
+ this(inputStream, httpContentType, lenient, null);
}
/**
@@ -379,127 +455,155 @@ public class XmlStreamReader extends Reader {
* the charset encoding.
* @param lenient indicates if the charset encoding detection should be
* relaxed.
+ * @param defaultEncoding The default encoding
* @throws IOException thrown if there is a problem reading the file.
* @throws XmlStreamReaderException thrown if the charset encoding could not
* be determined according to the specs.
*/
public XmlStreamReader(final InputStream inputStream, final String httpContentType,
- final boolean lenient) throws IOException {
- this(inputStream, httpContentType, lenient, null);
+ final boolean lenient, final String defaultEncoding) throws IOException {
+ Objects.requireNonNull(inputStream, "inputStream");
+ this.defaultEncoding = defaultEncoding;
+ final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(inputStream, BUFFER_SIZE), false, BOMS);
+ final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
+ this.encoding = processHttpStream(bom, pis, httpContentType, lenient);
+ this.reader = new InputStreamReader(pis, encoding);
}
/**
- * Returns the charset encoding of the XmlStreamReader.
+ * Creates a Reader using the InputStream of a URL.
+ * <p>
+ * If the URL is not of type HTTP and there is not 'content-type' header in
+ * the fetched data it uses the same logic used for Files.
+ * <p>
+ * If the URL is a HTTP Url or there is a 'content-type' header in the
+ * fetched data it uses the same logic used for an InputStream with
+ * content-type.
+ * <p>
+ * It does a lenient charset encoding detection, check the constructor with
+ * the lenient parameter for details.
*
- * @return charset encoding.
+ * @param url URL to create a Reader from.
+ * @throws IOException thrown if there is a problem reading the stream of
+ * the URL.
*/
- public String getEncoding() {
- return encoding;
+ public XmlStreamReader(final URL url) throws IOException {
+ this(Objects.requireNonNull(url, "url").openConnection(), null);
}
/**
- * Invokes the underlying reader's {@code read(char[], int, int)} method.
- * @param buf the buffer to read the characters into
- * @param offset The start offset
- * @param len The number of bytes to read
- * @return the number of characters read or -1 if the end of stream
- * @throws IOException if an I/O error occurs
- */
- @Override
- public int read(final char[] buf, final int offset, final int len) throws IOException {
- return reader.read(buf, offset, len);
- }
-
- /**
- * Closes the XmlStreamReader stream.
+ * Creates a Reader using the InputStream of a URLConnection.
+ * <p>
+ * If the URLConnection is not of type HttpURLConnection and there is not
+ * 'content-type' header in the fetched data it uses the same logic used for
+ * files.
+ * <p>
+ * If the URLConnection is a HTTP Url or there is a 'content-type' header in
+ * the fetched data it uses the same logic used for an InputStream with
+ * content-type.
+ * <p>
+ * It does a lenient charset encoding detection, check the constructor with
+ * the lenient parameter for details.
*
- * @throws IOException thrown if there was a problem closing the stream.
+ * @param conn URLConnection to create a Reader from.
+ * @param defaultEncoding The default encoding
+ * @throws IOException thrown if there is a problem reading the stream of
+ * the URLConnection.
*/
- @Override
- public void close() throws IOException {
- reader.close();
+ public XmlStreamReader(final URLConnection conn, final String defaultEncoding) throws IOException {
+ Objects.requireNonNull(conn, "conm");
+ this.defaultEncoding = defaultEncoding;
+ final boolean lenient = true;
+ final String contentType = conn.getContentType();
+ final InputStream inputStream = conn.getInputStream();
+ final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(inputStream, BUFFER_SIZE), false, BOMS);
+ final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
+ if (conn instanceof HttpURLConnection || contentType != null) {
+ this.encoding = processHttpStream(bom, pis, contentType, lenient);
+ } else {
+ this.encoding = doRawStream(bom, pis, lenient);
+ }
+ this.reader = new InputStreamReader(pis, encoding);
}
/**
- * Process the raw stream.
+ * Calculate the HTTP encoding.
*
- * @param bom BOMInputStream to detect byte order marks
- * @param pis BOMInputStream to guess XML encoding
+ * @param httpContentType The HTTP content type
+ * @param bomEnc BOM encoding
+ * @param xmlGuessEnc XML Guess encoding
+ * @param xmlEnc XML encoding
* @param lenient indicates if the charset encoding detection should be
* relaxed.
- * @return the encoding to be used
+ * @return the HTTP encoding
* @throws IOException thrown if there is a problem reading the stream.
*/
- private String doRawStream(final BOMInputStream bom, final BOMInputStream pis, final boolean lenient)
- throws IOException {
- final String bomEnc = bom.getBOMCharsetName();
- final String xmlGuessEnc = pis.getBOMCharsetName();
- final String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
- try {
- return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
- } catch (final XmlStreamReaderException ex) {
- if (lenient) {
- return doLenientDetection(null, ex);
+ String calculateHttpEncoding(final String httpContentType,
+ final String bomEnc, final String xmlGuessEnc, final String xmlEnc,
+ final boolean lenient) throws IOException {
+
+ // Lenient and has XML encoding
+ if (lenient && xmlEnc != null) {
+ return xmlEnc;
+ }
+
+ // Determine mime/encoding content types from HTTP Content Type
+ final String cTMime = getContentTypeMime(httpContentType);
+ final String cTEnc = getContentTypeEncoding(httpContentType);
+ final boolean appXml = isAppXml(cTMime);
+ final boolean textXml = isTextXml(cTMime);
+
+ // Mime type NOT "application/xml" or "text/xml"
+ if (!appXml && !textXml) {
+ final String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
+ throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
+ }
+
+ // No content type encoding
+ if (cTEnc == null) {
+ if (appXml) {
+ return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
}
- throw ex;
+ return defaultEncoding == null ? US_ASCII : defaultEncoding;
}
- }
- /**
- * Process a HTTP stream.
- *
- * @param bom BOMInputStream to detect byte order marks
- * @param pis BOMInputStream to guess XML encoding
- * @param httpContentType The HTTP content type
- * @param lenient indicates if the charset encoding detection should be
- * relaxed.
- * @return the encoding to be used
- * @throws IOException thrown if there is a problem reading the stream.
- */
- private String processHttpStream(final BOMInputStream bom, final BOMInputStream pis, final String httpContentType,
- final boolean lenient) throws IOException {
- final String bomEnc = bom.getBOMCharsetName();
- final String xmlGuessEnc = pis.getBOMCharsetName();
- final String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
- try {
- return calculateHttpEncoding(httpContentType, bomEnc, xmlGuessEnc, xmlEnc, lenient);
- } catch (final XmlStreamReaderException ex) {
- if (lenient) {
- return doLenientDetection(httpContentType, ex);
+ // UTF-16BE or UTF-16LE content type encoding
+ if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) {
+ if (bomEnc != null) {
+ final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
+ throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
}
- throw ex;
+ return cTEnc;
}
- }
- /**
- * Do lenient detection.
- *
- * @param httpContentType content-type header to use for the resolution of
- * the charset encoding.
- * @param ex The thrown exception
- * @return the encoding
- * @throws IOException thrown if there is a problem reading the stream.
- */
- private String doLenientDetection(String httpContentType,
- XmlStreamReaderException ex) throws IOException {
- if (httpContentType != null && httpContentType.startsWith("text/html")) {
- httpContentType = httpContentType.substring("text/html".length());
- httpContentType = "text/xml" + httpContentType;
- try {
- return calculateHttpEncoding(httpContentType, ex.getBomEncoding(),
- ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true);
- } catch (final XmlStreamReaderException ex2) {
- ex = ex2;
+ // UTF-16 content type encoding
+ if (cTEnc.equals(UTF_16)) {
+ if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
+ return bomEnc;
}
+ final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
+ throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
}
- String encoding = ex.getXmlEncoding();
- if (encoding == null) {
- encoding = ex.getContentTypeEncoding();
+
+ // UTF-32BE or UTF-132E content type encoding
+ if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) {
+ if (bomEnc != null) {
+ final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
+ throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
+ }
+ return cTEnc;
}
- if (encoding == null) {
- encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
+
+ // UTF-32 content type encoding
+ if (cTEnc.equals(UTF_32)) {
+ if (bomEnc != null && bomEnc.startsWith(UTF_32)) {
+ return bomEnc;
+ }
+ final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
+ throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
}
- return encoding;
+
+ return cTEnc;
}
/**
@@ -570,234 +674,130 @@ public class XmlStreamReader extends Reader {
throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
}
+ /**
+ * Closes the XmlStreamReader stream.
+ *
+ * @throws IOException thrown if there was a problem closing the stream.
+ */
+ @Override
+ public void close() throws IOException {
+ reader.close();
+ }
/**
- * Calculate the HTTP encoding.
+ * Do lenient detection.
*
- * @param httpContentType The HTTP content type
- * @param bomEnc BOM encoding
- * @param xmlGuessEnc XML Guess encoding
- * @param xmlEnc XML encoding
- * @param lenient indicates if the charset encoding detection should be
- * relaxed.
- * @return the HTTP encoding
+ * @param httpContentType content-type header to use for the resolution of
+ * the charset encoding.
+ * @param ex The thrown exception
+ * @return the encoding
* @throws IOException thrown if there is a problem reading the stream.
*/
- String calculateHttpEncoding(final String httpContentType,
- final String bomEnc, final String xmlGuessEnc, final String xmlEnc,
- final boolean lenient) throws IOException {
-
- // Lenient and has XML encoding
- if (lenient && xmlEnc != null) {
- return xmlEnc;
- }
-
- // Determine mime/encoding content types from HTTP Content Type
- final String cTMime = getContentTypeMime(httpContentType);
- final String cTEnc = getContentTypeEncoding(httpContentType);
- final boolean appXml = isAppXml(cTMime);
- final boolean textXml = isTextXml(cTMime);
-
- // Mime type NOT "application/xml" or "text/xml"
- if (!appXml && !textXml) {
- final String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
- throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
- }
-
- // No content type encoding
- if (cTEnc == null) {
- if (appXml) {
- return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
- }
- return defaultEncoding == null ? US_ASCII : defaultEncoding;
- }
-
- // UTF-16BE or UTF-16LE content type encoding
- if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) {
- if (bomEnc != null) {
- final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
- throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
- }
- return cTEnc;
- }
-
- // UTF-16 content type encoding
- if (cTEnc.equals(UTF_16)) {
- if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
- return bomEnc;
+ private String doLenientDetection(String httpContentType,
+ XmlStreamReaderException ex) throws IOException {
+ if (httpContentType != null && httpContentType.startsWith("text/html")) {
+ httpContentType = httpContentType.substring("text/html".length());
+ httpContentType = "text/xml" + httpContentType;
+ try {
+ return calculateHttpEncoding(httpContentType, ex.getBomEncoding(),
+ ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true);
+ } catch (final XmlStreamReaderException ex2) {
+ ex = ex2;
}
- final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
- throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
}
-
- // UTF-32BE or UTF-132E content type encoding
- if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) {
- if (bomEnc != null) {
- final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
- throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
- }
- return cTEnc;
+ String encoding = ex.getXmlEncoding();
+ if (encoding == null) {
+ encoding = ex.getContentTypeEncoding();
}
-
- // UTF-32 content type encoding
- if (cTEnc.equals(UTF_32)) {
- if (bomEnc != null && bomEnc.startsWith(UTF_32)) {
- return bomEnc;
- }
- final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
- throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
+ if (encoding == null) {
+ encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
}
-
- return cTEnc;
+ return encoding;
}
/**
- * Returns MIME type or NULL if httpContentType is NULL.
+ * Process the raw stream.
*
- * @param httpContentType the HTTP content type
- * @return The mime content type
+ * @param bom BOMInputStream to detect byte order marks
+ * @param pis BOMInputStream to guess XML encoding
+ * @param lenient indicates if the charset encoding detection should be
+ * relaxed.
+ * @return the encoding to be used
+ * @throws IOException thrown if there is a problem reading the stream.
*/
- static String getContentTypeMime(final String httpContentType) {
- String mime = null;
- if (httpContentType != null) {
- final int i = httpContentType.indexOf(";");
- if (i >= 0) {
- mime = httpContentType.substring(0, i);
- } else {
- mime = httpContentType;
+ private String doRawStream(final BOMInputStream bom, final BOMInputStream pis, final boolean lenient)
+ throws IOException {
+ final String bomEnc = bom.getBOMCharsetName();
+ final String xmlGuessEnc = pis.getBOMCharsetName();
+ final String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
+ try {
+ return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
+ } catch (final XmlStreamReaderException ex) {
+ if (lenient) {
+ return doLenientDetection(null, ex);
}
- mime = mime.trim();
+ throw ex;
}
- return mime;
}
- private static final Pattern CHARSET_PATTERN = Pattern
- .compile("charset=[\"']?([.[^; \"']]*)[\"']?");
-
/**
- * Returns charset parameter value, NULL if not present, NULL if
- * httpContentType is NULL.
+ * Returns the default encoding to use if none is set in HTTP content-type,
+ * XML prolog and the rules based on content-type are not adequate.
+ * <p>
+ * If it is NULL the content-type based rules are used.
*
- * @param httpContentType the HTTP content type
- * @return The content type encoding (upcased)
+ * @return the default encoding to use.
*/
- static String getContentTypeEncoding(final String httpContentType) {
- String encoding = null;
- if (httpContentType != null) {
- final int i = httpContentType.indexOf(";");
- if (i > -1) {
- final String postMime = httpContentType.substring(i + 1);
- final Matcher m = CHARSET_PATTERN.matcher(postMime);
- encoding = m.find() ? m.group(1) : null;
- encoding = encoding != null ? encoding.toUpperCase(Locale.ROOT) : null;
- }
- }
- return encoding;
+ public String getDefaultEncoding() {
+ return defaultEncoding;
}
/**
- * Pattern capturing the encoding of the "xml" processing instruction.
- */
- public static final Pattern ENCODING_PATTERN = Pattern.compile(
- "<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))",
- Pattern.MULTILINE);
-
- /**
- * Returns the encoding declared in the <?xml encoding=...?>, NULL if none.
+ * Returns the charset encoding of the XmlStreamReader.
*
- * @param inputStream InputStream to create the reader from.
- * @param guessedEnc guessed encoding
- * @return the encoding declared in the <?xml encoding=...?>
- * @throws IOException thrown if there is a problem reading the stream.
+ * @return charset encoding.
*/
- private static String getXmlProlog(final InputStream inputStream, final String guessedEnc)
- throws IOException {
- String encoding = null;
- if (guessedEnc != null) {
- final byte[] bytes = new byte[BUFFER_SIZE];
- inputStream.mark(BUFFER_SIZE);
- int offset = 0;
- int max = BUFFER_SIZE;
- int c = inputStream.read(bytes, offset, max);
- int firstGT = -1;
- String xmlProlog = ""; // avoid possible NPE warning (cannot happen; this just silences the warning)
- while (c != -1 && firstGT == -1 && offset < BUFFER_SIZE) {
- offset += c;
- max -= c;
- c = inputStream.read(bytes, offset, max);
- xmlProlog = new String(bytes, 0, offset, guessedEnc);
- firstGT = xmlProlog.indexOf('>');
- }
- if (firstGT == -1) {
- if (c == -1) {
- throw new IOException("Unexpected end of XML stream");
- }
- throw new IOException(
- "XML prolog or ROOT element not found on first "
- + offset + " bytes");
- }
- final int bytesRead = offset;
- if (bytesRead > 0) {
- inputStream.reset();
- final BufferedReader bReader = new BufferedReader(new StringReader(
- xmlProlog.substring(0, firstGT + 1)));
- final StringBuffer prolog = new StringBuffer();
- String line;
- while ((line = bReader.readLine()) != null) {
- prolog.append(line);
- }
- final Matcher m = ENCODING_PATTERN.matcher(prolog);
- if (m.find()) {
- encoding = m.group(1).toUpperCase(Locale.ROOT);
- encoding = encoding.substring(1, encoding.length() - 1);
- }
- }
- }
+ public String getEncoding() {
return encoding;
}
/**
- * Indicates if the MIME type belongs to the APPLICATION XML family.
+ * Process a HTTP stream.
*
- * @param mime The mime type
- * @return true if the mime type belongs to the APPLICATION XML family,
- * otherwise false
+ * @param bom BOMInputStream to detect byte order marks
+ * @param pis BOMInputStream to guess XML encoding
+ * @param httpContentType The HTTP content type
+ * @param lenient indicates if the charset encoding detection should be
+ * relaxed.
+ * @return the encoding to be used
+ * @throws IOException thrown if there is a problem reading the stream.
*/
- static boolean isAppXml(final String mime) {
- return mime != null &&
- (mime.equals("application/xml") ||
- mime.equals("application/xml-dtd") ||
- mime.equals("application/xml-external-parsed-entity") ||
- mime.startsWith("application/") && mime.endsWith("+xml"));
+ private String processHttpStream(final BOMInputStream bom, final BOMInputStream pis, final String httpContentType,
+ final boolean lenient) throws IOException {
+ final String bomEnc = bom.getBOMCharsetName();
+ final String xmlGuessEnc = pis.getBOMCharsetName();
+ final String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
+ try {
+ return calculateHttpEncoding(httpContentType, bomEnc, xmlGuessEnc, xmlEnc, lenient);
+ } catch (final XmlStreamReaderException ex) {
+ if (lenient) {
+ return doLenientDetection(httpContentType, ex);
+ }
+ throw ex;
+ }
}
/**
- * Indicates if the MIME type belongs to the TEXT XML family.
- *
- * @param mime The mime type
- * @return true if the mime type belongs to the TEXT XML family,
- * otherwise false
+ * Invokes the underlying reader's {@code read(char[], int, int)} method.
+ * @param buf the buffer to read the characters into
+ * @param offset The start offset
+ * @param len The number of bytes to read
+ * @return the number of characters read or -1 if the end of stream
+ * @throws IOException if an I/O error occurs
*/
- static boolean isTextXml(final String mime) {
- return mime != null &&
- (mime.equals("text/xml") ||
- mime.equals("text/xml-external-parsed-entity") ||
- mime.startsWith("text/") && mime.endsWith("+xml"));
+ @Override
+ public int read(final char[] buf, final int offset, final int len) throws IOException {
+ return reader.read(buf, offset, len);
}
- private static final String RAW_EX_1 =
- "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";
-
- private static final String RAW_EX_2 =
- "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";
-
- private static final String HTTP_EX_1 =
- "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL";
-
- private static final String HTTP_EX_2 =
- "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";
-
- private static final String HTTP_EX_3 =
- "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME";
-
}