You are viewing a plain text version of this content. The canonical link for it is here.
Posted to slide-dev@jakarta.apache.org by je...@apache.org on 2002/08/15 17:15:58 UTC
cvs commit: jakarta-slide/src/util/org/apache/util URI.java
jericho 2002/08/15 08:15:58
Modified: src/util/org/apache/util URI.java
Log:
- Couple the two protected methods
(encode and escape, decode and unescape)
- Fix the encoding bug.
- Add an easy way to support old character encodings
from servlets.com distribution
- Make static methods for setting and getting charset.
Revision Changes Path
1.11 +147 -122 jakarta-slide/src/util/org/apache/util/URI.java
Index: URI.java
===================================================================
RCS file: /home/cvs/jakarta-slide/src/util/org/apache/util/URI.java,v
retrieving revision 1.10
retrieving revision 1.11
diff -u -r1.10 -r1.11
--- URI.java 25 Jul 2002 11:00:52 -0000 1.10
+++ URI.java 15 Aug 2002 15:15:58 -0000 1.11
@@ -66,7 +66,9 @@
import java.io.IOException;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
+import java.util.Locale;
import java.util.BitSet;
+import java.util.Hashtable;
import java.security.AccessController;
import java.security.PrivilegedAction;
import sun.security.action.GetBooleanAction;
@@ -128,19 +130,17 @@
* Please, notice that there are many modifications from URL(RFC 1738) and
* relative URL(RFC 1808).
* <p>
- * <b>The recommendation to use the URI class</b>
+ * <b>The expressions for a URI</b>
* <p><pre>
- * Communication function level
- * - URI(<code>char</code>[]) // constructor
- * - <code>char</code>[] getRawXxx() // method
+ * For escaped URI forms
+ * - URI(char[]) // constructor
+ * - char[] getRawXxx() // method
+ * - String getEscapedXxx() // method
+ * - String toString() // method
* <p>
- * Document and data processing function level
- * - URI(<code>String</code>) // constructor
- * - <code>String</code> getXXX() // method
- * <p>
- * Both Level
- * - <code>String</code> getEscapedXxx() // method
- * - <code>String</code> toString() // method
+ * For unescaped URI forms
+ * - URI(String) // constructor
+ * - String getXXX() // method
* </pre><p>
*
* @author <a href="mailto:jericho@apache.org">Sung-Gu</a>
@@ -471,9 +471,14 @@
protected static String _documentCharset = null;
// Static initializer for _documentCharset
static {
- _documentCharset = (String)AccessController.doPrivileged (
- new GetPropertyAction("file.encoding")
- );
+ Locale locale = Locale.getDefault();
+ if (locale != null) {
+ // in order to support backward compatibility
+ _documentCharset = LocaleToCharsetMap.getCharset(locale);
+ } else {
+ _documentCharset = (String)AccessController.doPrivileged(
+ new GetPropertyAction("file.encoding"));
+ }
}
/**
@@ -1366,6 +1371,11 @@
* original character sequence->octet sequence->URI character sequence
* </pre></blockquote><p>
*
+ * An escaped octet is encoded as a character triplet, consisting of the
+ * percent character "%" followed by the two hexadecimal digits
+ * representing the octet code. For example, "%20" is the escaped
+ * encoding for the US-ASCII space character.
+ * <p>
* Conversion from the local filesystem character set to UTF-8 will
* normally involve a two step process. First convert the local character
* set to the UCS; then convert the UCS to UTF-8.
@@ -1395,89 +1405,23 @@
if (original == null) {
throw new URIException("original");
}
- byte[] octet = original.getBytes(_documentCharset);
- return escape(octet, allowed);
- }
-
-
- /**
- * This is a two mapping, one from URI characters to octets, and
- * subsequently a second from octets to original characters:
- * <p><blockquote><pre>
- * URI character sequence->octet sequence->original character sequence
- * </pre></blockquote><p>
- *
- * A URI must be separated into its components before the escaped
- * characters within those components can be allowedly decoded.
- * <p>
- * Notice that there is a chance that URI characters that are non UTF-8
- * may be parsed as valid UTF-8. A recent non-scientific analysis found
- * that EUC encoded Japanese words had a 2.7% false reading; SJIS had a
- * 0.0005% false reading; other encoding such as ASCII or KOI-8 have a 0%
- * false reading.
- * <p>
- * The unescape method is internally performed within this method.
- *
- * @param octet the octet sequence
- * @return original character sequence
- * @exception UnsupportedEncodingException unsupported character encoding
- * @exception URIException incomplete trailing escape pattern
- * @throws NullPointerException null argument
- * @see #unescape
- */
- protected String decode(char[] uri)
- throws UnsupportedEncodingException, URIException {
-
- // decode uri to original characters.
- return new String(unescape(uri), _documentCharset);
- }
-
-
- /**
- * This is a mapping from octets to URI characters:
- * <p><blockquote><pre>
- * octet sequence->URI character sequence
- * </pre></blockquote><p>
- *
- * An escaped octet is encoded as a character triplet, consisting of the
- * percent character "%" followed by the two hexadecimal digits
- * representing the octet code. For example, "%20" is the escaped
- * encoding for the US-ASCII space character.
- *
- * @param octet the octet sequence to be escaped
- * @param allowed those characters that are allowed within a component
- * @return URI character sequence
- * @exception UnsupportedEncodingException unsupported character encoding
- * @exception URIException
- */
- protected char[] escape(byte[] octet, BitSet allowed)
- throws UnsupportedEncodingException, URIException {
-
// escape octet to uri characters.
- if (octet == null) {
- throw new URIException("null octets");
- }
if (allowed == null) {
throw new URIException("null allowed characters");
}
- String octets = new String(octet, _protocolCharset);
- char[] preuric = new char[octets.length()];
- if (octet.length == 0) {
- return preuric; // defined, but empty
- }
- octets.getChars(0, octets.length(), preuric, 0);
- StringBuffer buf = new StringBuffer(preuric.length);
- for (int i = 0; i < preuric.length; i++) {
- char c = (char) preuric[i];
+ byte[] octets = original.getBytes(_protocolCharset);
+ StringBuffer buf = new StringBuffer(octets.length);
+ for (int i = 0; i < octets.length; i++) {
+ char c = (char) octets[i];
if (allowed.get(c)) {
buf.append(c);
} else {
- byte b = (byte) c;
buf.append('%');
+ byte b = octets[i];
char hexadecimal = Character.forDigit((b >> 4) & 0xF, 16);
- buf.append(hexadecimal);
+ buf.append(Character.toUpperCase(hexadecimal)); // high
hexadecimal = Character.forDigit(b & 0xF, 16);
- buf.append(hexadecimal);
+ buf.append(Character.toUpperCase(hexadecimal)); // low
}
}
@@ -1486,45 +1430,57 @@
/**
- * This is a mapping from URI characters to octets:
+ * This is a two mapping, one from URI characters to octets, and
+ * subsequently a second from octets to original characters:
* <p><blockquote><pre>
- * URI character sequence->octet sequence
+ * URI character sequence->octet sequence->original character sequence
* </pre></blockquote><p>
*
+ * A URI must be separated into its components before the escaped
+ * characters within those components can be safely decoded.
+ * <p>
+ * Notice that there is a chance that URI characters that are non UTF-8
+ * may be parsed as valid UTF-8. A recent non-scientific analysis found
+ * that EUC encoded Japanese words had a 2.7% false reading; SJIS had a
+ * 0.0005% false reading; other encoding such as ASCII or KOI-8 have a 0%
+ * false reading.
+ * <p>
* The percent "%" character always has the reserved purpose of being
* the escape indicator, it must be escaped as "%25" in order to be used
* as data within a URI.
+ * <p>
+ * The unescape method is internally performed within this method.
*
* @param uri the URI character sequence
- * @return octet sequence
+ * @return original character sequence
* @exception UnsupportedEncodingException unsupported character encoding
* @exception URIException incomplete trailing escape pattern
* @throws NullPointerException null argument
*/
- protected byte[] unescape(char[] uri)
+ protected String decode(char[] uri)
throws UnsupportedEncodingException, URIException {
// unescape uri characters to octets
- if (uri == null) {
- throw new URIException("uri");
- }
- byte[] octet = new String(uri).getBytes(_protocolCharset);
+ if (uri == null) return null;
+
+ byte[] octets = new String(uri).getBytes(_protocolCharset);
+ int length = octets.length;
int oi = 0; // output index
- for (int ii = 0; ii < uri.length; oi++) {
- byte b = (byte) octet[ii++];
- if (b == '%') {
- b = (byte) ((Character.digit((char) octet[ii++], 16) << 4) +
- Character.digit((char) octet[ii++], 16));
- if (b == -1) {
+ for (int ii = 0; ii < length; oi++) {
+ byte aByte = (byte) octets[ii++];
+ if (aByte == '%' && ii+2 <= length) {
+ byte high = (byte) Character.digit((char) octets[ii++], 16);
+ byte low = (byte) Character.digit((char) octets[ii++], 16);
+ if (high == -1 || low == -1) {
throw new URIException(
"incomplete trailing escape pattern");
}
+ aByte = (byte) ((high << 4) + low);
}
- octet[oi] = (byte) b;
+ octets[oi] = (byte) aByte;
}
- byte[] result = new byte[oi];
- System.arraycopy(octet, 0, result, 0, oi);
- return result;
+
+ return new String(octets, 0, oi, _protocolCharset);
}
@@ -1641,14 +1597,6 @@
String tmp = original.trim();
/**
- * Consider of the character encoding of the document.
- * The platform's charset is used for the document by default.
- */
- if (_documentCharset != null && !escaped) {
- tmp = new String(tmp.getBytes(_documentCharset), _documentCharset);
- }
-
- /**
* The starting index
*/
int from = 0;
@@ -2000,7 +1948,7 @@
}
if (_opaque != null && _is_opaque_part) {
buf.append(_opaque);
- } else if (_path != null) { // && _path.length != 0) {
+ } else if (_path != null) {
// _is_hier_part or _is_relativeURI
if (_path.length != 0) {
buf.append(_path);
@@ -2203,7 +2151,7 @@
*
* @param charset the default charset for each protocol
*/
- public void setProtocolCharset(String charset) {
+ public static void setProtocolCharset(String charset) {
_protocolCharset = charset;
}
@@ -2222,7 +2170,7 @@
*
* @return the charset string
*/
- public String getProtocolCharset() {
+ public static String getProtocolCharset() {
return _protocolCharset;
}
@@ -2238,7 +2186,7 @@
*
* @param charset the default charset for the document
*/
- public void setDocumentCharset(String charset) {
+ public static void setDocumentCharset(String charset) {
_documentCharset = charset;
}
@@ -2248,7 +2196,7 @@
*
* @return the charset string
*/
- public String getDocumentCharset() {
+ public static String getDocumentCharset() {
return _documentCharset;
}
@@ -3183,4 +3131,81 @@
return getEscapedURI();
}
+
+ // ------------------------------------------------------------ Inner class
+
+ /**
+ * A mapping to determine the (somewhat arbitrarily) preferred charset for
+ * a given locale. Supports all locales recognized in JDK 1.1.
+ * <p>
+ * The distribution of this class is Servlets.com. It was originally
+ * written by Jason Hunter [jhunter@acm.org] and used by the Jakarta Slide
+ * project with permission.
+ */
+ public static class LocaleToCharsetMap {
+
+ private static Hashtable map;
+ static {
+ map = new Hashtable();
+ map.put("ar", "ISO-8859-6");
+ map.put("be", "ISO-8859-5");
+ map.put("bg", "ISO-8859-5");
+ map.put("ca", "ISO-8859-1");
+ map.put("cs", "ISO-8859-2");
+ map.put("da", "ISO-8859-1");
+ map.put("de", "ISO-8859-1");
+ map.put("el", "ISO-8859-7");
+ map.put("en", "ISO-8859-1");
+ map.put("es", "ISO-8859-1");
+ map.put("et", "ISO-8859-1");
+ map.put("fi", "ISO-8859-1");
+ map.put("fr", "ISO-8859-1");
+ map.put("hr", "ISO-8859-2");
+ map.put("hu", "ISO-8859-2");
+ map.put("is", "ISO-8859-1");
+ map.put("it", "ISO-8859-1");
+ map.put("iw", "ISO-8859-8");
+ map.put("ja", "Shift_JIS");
+ map.put("ko", "EUC-KR");
+ map.put("lt", "ISO-8859-2");
+ map.put("lv", "ISO-8859-2");
+ map.put("mk", "ISO-8859-5");
+ map.put("nl", "ISO-8859-1");
+ map.put("no", "ISO-8859-1");
+ map.put("pl", "ISO-8859-2");
+ map.put("pt", "ISO-8859-1");
+ map.put("ro", "ISO-8859-2");
+ map.put("ru", "ISO-8859-5");
+ map.put("sh", "ISO-8859-5");
+ map.put("sk", "ISO-8859-2");
+ map.put("sl", "ISO-8859-2");
+ map.put("sq", "ISO-8859-2");
+ map.put("sr", "ISO-8859-5");
+ map.put("sv", "ISO-8859-1");
+ map.put("tr", "ISO-8859-9");
+ map.put("uk", "ISO-8859-5");
+ map.put("zh", "GB2312");
+ map.put("zh_TW", "Big5");
+ }
+
+ /**
+ * Get the preferred charset for the given locale.
+ *
+ * @param locale the locale
+ * @return the preferred charset
+ * or null if the locale is not recognized
+ */
+ public static String getCharset(Locale locale) {
+ // try for a full name match (may include country)
+ String charset = (String) map.get(locale.toString());
+ if (charset != null) return charset;
+
+ // if a full name didn't match, try just the language
+ charset = (String) map.get(locale.getLanguage());
+ return charset; // may be null
+ }
+
+ }
+
}
+
--
To unsubscribe, e-mail: <ma...@jakarta.apache.org>
For additional commands, e-mail: <ma...@jakarta.apache.org>