You are viewing a plain text version of this content. The canonical link for it is here.
Posted to slide-dev@jakarta.apache.org by je...@apache.org on 2002/08/15 17:15:58 UTC

cvs commit: jakarta-slide/src/util/org/apache/util URI.java

jericho     2002/08/15 08:15:58

  Modified:    src/util/org/apache/util URI.java
  Log:
  - Couple the two protected methods
    (encode and escape, decode and unescape)
  - Fix the encoding bug.
  - Add an easy way to support old character encodings
    from servlets.com distribution
  - Make static methods for setting and getting charset.
  
  Revision  Changes    Path
  1.11      +147 -122  jakarta-slide/src/util/org/apache/util/URI.java
  
  Index: URI.java
  ===================================================================
  RCS file: /home/cvs/jakarta-slide/src/util/org/apache/util/URI.java,v
  retrieving revision 1.10
  retrieving revision 1.11
  diff -u -r1.10 -r1.11
  --- URI.java	25 Jul 2002 11:00:52 -0000	1.10
  +++ URI.java	15 Aug 2002 15:15:58 -0000	1.11
  @@ -66,7 +66,9 @@
   import java.io.IOException;
   import java.io.Serializable;
   import java.io.UnsupportedEncodingException;
  +import java.util.Locale;
   import java.util.BitSet;
  +import java.util.Hashtable;
   import java.security.AccessController;
   import java.security.PrivilegedAction;
   import sun.security.action.GetBooleanAction;
  @@ -128,19 +130,17 @@
    * Please, notice that there are many modifications from URL(RFC 1738) and
    * relative URL(RFC 1808).
    * <p>
  - * <b>The recommendation to use the URI class</b>
  + * <b>The expressions for a URI</b>
    * <p><pre>
  - * Communication function level
  - *  - URI(<code>char</code>[]) // constructor
  - *  - <code>char</code>[] getRawXxx() // method
  + * For escaped URI forms
  + *  - URI(char[]) // constructor
  + *  - char[] getRawXxx() // method
  + *  - String getEscapedXxx() // method
  + *  - String toString() // method
    * <p>
  - * Document and data processing function level
  - *  - URI(<code>String</code>) // constructor
  - *  - <code>String</code> getXXX() // method
  - * <p>
  - * Both Level
  - *  - <code>String</code> getEscapedXxx() // method
  - *  - <code>String</code> toString() // method
  + * For unescaped URI forms
  + *  - URI(String) // constructor
  + *  - String getXXX() // method
    * </pre><p>
    *
    * @author <a href="mailto:jericho@apache.org">Sung-Gu</a>
  @@ -471,9 +471,14 @@
       protected static String _documentCharset = null;
       // Static initializer for _documentCharset
       static {
  -        _documentCharset = (String)AccessController.doPrivileged (
  -            new GetPropertyAction("file.encoding")
  -        );
  +        Locale locale = Locale.getDefault();
  +        if (locale != null) {
  +            // in order to support backward compatibility
  +            _documentCharset = LocaleToCharsetMap.getCharset(locale);
  +        } else {
  +            _documentCharset = (String)AccessController.doPrivileged(
  +                    new GetPropertyAction("file.encoding"));
  +        }
       }
   
       /**
  @@ -1366,6 +1371,11 @@
        *   original character sequence->octet sequence->URI character sequence
        * </pre></blockquote><p>
        *
  +     * An escaped octet is encoded as a character triplet, consisting of the
  +     * percent character "%" followed by the two hexadecimal digits
  +     * representing the octet code. For example, "%20" is the escaped
  +     * encoding for the US-ASCII space character.
  +     * <p>
        * Conversion from the local filesystem character set to UTF-8 will
        * normally involve a two step process. First convert the local character
        * set to the UCS; then convert the UCS to UTF-8.
  @@ -1395,89 +1405,23 @@
           if (original == null) {
               throw new URIException("original");
           }
  -        byte[] octet = original.getBytes(_documentCharset);
  -        return escape(octet, allowed);
  -    }
  -
  -
  -    /**
  -     * This is a two mapping, one from URI characters to octets, and
  -     * subsequently a second from octets to original characters:
  -     * <p><blockquote><pre>
  -     *   URI character sequence->octet sequence->original character sequence
  -     * </pre></blockquote><p>
  -     *
  -     * A URI must be separated into its components before the escaped
  -     * characters within those components can be allowedly decoded.
  -     * <p>
  -     * Notice that there is a chance that URI characters that are non UTF-8
  -     * may be parsed as valid UTF-8.  A recent non-scientific analysis found
  -     * that EUC encoded Japanese words had a 2.7% false reading; SJIS had a
  -     * 0.0005% false reading; other encoding such as ASCII or KOI-8 have a 0%
  -     * false reading.
  -     * <p>
  -     * The unescape method is internally performed within this method.
  -     *
  -     * @param octet the octet sequence
  -     * @return original character sequence
  -     * @exception UnsupportedEncodingException unsupported character encoding
  -     * @exception URIException incomplete trailing escape pattern
  -     * @throws NullPointerException null argument
  -     * @see #unescape
  -     */
  -    protected String decode(char[] uri)
  -        throws UnsupportedEncodingException, URIException {
  -
  -        // decode uri to original characters.
  -        return new String(unescape(uri), _documentCharset);
  -    }
  -
  -
  -    /**
  -     * This is a mapping from octets to URI characters:
  -     * <p><blockquote><pre>
  -     *   octet sequence->URI character sequence
  -     * </pre></blockquote><p>
  -     *
  -     * An escaped octet is encoded as a character triplet, consisting of the
  -     * percent character "%" followed by the two hexadecimal digits
  -     * representing the octet code. For example, "%20" is the escaped
  -     * encoding for the US-ASCII space character.
  -     *
  -     * @param octet the octet sequence to be escaped
  -     * @param allowed those characters that are allowed within a component
  -     * @return URI character sequence
  -     * @exception UnsupportedEncodingException unsupported character encoding
  -     * @exception URIException
  -     */
  -    protected char[] escape(byte[] octet, BitSet allowed)
  -        throws UnsupportedEncodingException, URIException {
  -
           // escape octet to uri characters.
  -        if (octet == null) {
  -            throw new URIException("null octets");
  -        }
           if (allowed == null) {
               throw new URIException("null allowed characters");
           }
  -        String octets = new String(octet, _protocolCharset);
  -        char[] preuric = new char[octets.length()];
  -        if (octet.length == 0) {
  -            return preuric;  // defined, but empty
  -        }
  -        octets.getChars(0, octets.length(), preuric, 0);
  -        StringBuffer buf = new StringBuffer(preuric.length);
  -        for (int i = 0; i < preuric.length; i++) {
  -            char c = (char) preuric[i];
  +        byte[] octets = original.getBytes(_protocolCharset);
  +        StringBuffer buf = new StringBuffer(octets.length);
  +        for (int i = 0; i < octets.length; i++) {
  +            char c = (char) octets[i];
               if (allowed.get(c)) {
                   buf.append(c);
               } else {
  -                byte b = (byte) c;
                   buf.append('%');
  +                byte b = octets[i];
                   char hexadecimal = Character.forDigit((b >> 4) & 0xF, 16);
  -                buf.append(hexadecimal);
  +                buf.append(Character.toUpperCase(hexadecimal)); // high
                   hexadecimal = Character.forDigit(b & 0xF, 16);
  -                buf.append(hexadecimal);
  +                buf.append(Character.toUpperCase(hexadecimal)); // low
               }
           }
   
  @@ -1486,45 +1430,57 @@
   
   
       /**
  -     * This is a mapping from URI characters to octets:
  +     * This is a two mapping, one from URI characters to octets, and
  +     * subsequently a second from octets to original characters:
        * <p><blockquote><pre>
  -     *   URI character sequence->octet sequence
  +     *   URI character sequence->octet sequence->original character sequence
        * </pre></blockquote><p>
        *
  +     * A URI must be separated into its components before the escaped
  +     * characters within those components can be allowedly decoded.
  +     * <p>
  +     * Notice that there is a chance that URI characters that are non UTF-8
  +     * may be parsed as valid UTF-8.  A recent non-scientific analysis found
  +     * that EUC encoded Japanese words had a 2.7% false reading; SJIS had a
  +     * 0.0005% false reading; other encoding such as ASCII or KOI-8 have a 0%
  +     * false reading.
  +     * <p>
        * The percent "%" character always has the reserved purpose of being
        * the escape indicator, it must be escaped as "%25" in order to be used
        * as data within a URI.
  +     * <p>
  +     * The unescape method is internally performed within this method.
        *
        * @param uri the URI character sequence
  -     * @return octet sequence
  +     * @return original character sequence
        * @exception UnsupportedEncodingException unsupported character encoding
        * @exception URIException incomplete trailing escape pattern
        * @throws NullPointerException null argument
        */
  -    protected byte[] unescape(char[] uri)
  +    protected String decode(char[] uri)
           throws UnsupportedEncodingException, URIException {
   
           // unescape uri characters to octets
  -        if (uri == null) {
  -            throw new URIException("uri");
  -        }
  -        byte[] octet = new String(uri).getBytes(_protocolCharset);
  +        if (uri == null)  return null;
  +
  +        byte[] octets = new String(uri).getBytes(_protocolCharset);
  +        int length = octets.length;
           int oi = 0; // output index
  -        for (int ii = 0; ii < uri.length; oi++) {
  -            byte b = (byte) octet[ii++];
  -            if (b == '%') {
  -                b = (byte) ((Character.digit((char) octet[ii++], 16) << 4) +
  -                Character.digit((char) octet[ii++], 16));
  -                if (b == -1) {
  +        for (int ii = 0; ii < length; oi++) {
  +            byte aByte = (byte) octets[ii++];
  +            if (aByte == '%' && ii+2 <= length)  {
  +                byte high = (byte) Character.digit((char) octets[ii++], 16);
  +                byte low = (byte) Character.digit((char) octets[ii++], 16);
  +                if (high == -1 || low == -1) {
                       throw new URIException(
                               "incomplete trailing escape pattern");
                   }
  +                aByte = (byte) ((high << 4) + low);
               }
  -            octet[oi] = (byte) b;
  +            octets[oi] = (byte) aByte;
           }
  -        byte[] result = new byte[oi];
  -        System.arraycopy(octet, 0, result, 0, oi);
  -        return result;
  +
  +        return new String(octets, 0, oi, _protocolCharset);
       }
   
   
  @@ -1641,14 +1597,6 @@
           String tmp = original.trim();
   
           /**
  -         * Consider of the character encoding of the document.
  -         * The platform's charset is used for the document by default.
  -         */
  -        if (_documentCharset != null && !escaped) {
  -            tmp = new String(tmp.getBytes(_documentCharset), _documentCharset);
  -        }
  -
  -        /**
            * The starting index
            */
           int from = 0;
  @@ -2000,7 +1948,7 @@
           }
           if (_opaque != null && _is_opaque_part) {
               buf.append(_opaque);
  -        } else if (_path != null) { //  && _path.length != 0) {
  +        } else if (_path != null) {
               // _is_hier_part or _is_relativeURI
               if (_path.length != 0) {
                   buf.append(_path);
  @@ -2203,7 +2151,7 @@
        *
        * @param charset the default charset for each protocol
        */
  -    public void setProtocolCharset(String charset) {
  +    public static void setProtocolCharset(String charset) {
           _protocolCharset = charset;
       }
   
  @@ -2222,7 +2170,7 @@
        *
        * @return the charset string
        */
  -    public String getProtocolCharset() {
  +    public static String getProtocolCharset() {
           return _protocolCharset;
       }
   
  @@ -2238,7 +2186,7 @@
        *
        * @param charset the default charset for the document
        */
  -    public void setDocumentCharset(String charset) {
  +    public static void setDocumentCharset(String charset) {
           _documentCharset = charset;
       }
   
  @@ -2248,7 +2196,7 @@
        *
        * @return the charset string
        */
  -    public String getDocumentCharset() {
  +    public static String getDocumentCharset() {
           return _documentCharset;
       }
   
  @@ -3183,4 +3131,81 @@
           return getEscapedURI();
       }
   
  +
  +    // ------------------------------------------------------------ Inner class
  +
  +    /** 
  +     * A mapping to determine the (somewhat arbitrarily) preferred charset for 
  +     * a given locale.  Supports all locales recognized in JDK 1.1.
  +     * <p>
  +     * The distribution of this class is Servlets.com.    It was originally
  +     * written by Jason Hunter [jhunter@acm.org] and used by the Jakarta Slide
  +     * project with permission.
  +     */
  +    public static class LocaleToCharsetMap {
  +
  +        private static Hashtable map;
  +        static {
  +            map = new Hashtable();
  +            map.put("ar", "ISO-8859-6");
  +            map.put("be", "ISO-8859-5");
  +            map.put("bg", "ISO-8859-5");
  +            map.put("ca", "ISO-8859-1");
  +            map.put("cs", "ISO-8859-2");
  +            map.put("da", "ISO-8859-1");
  +            map.put("de", "ISO-8859-1");
  +            map.put("el", "ISO-8859-7");
  +            map.put("en", "ISO-8859-1");
  +            map.put("es", "ISO-8859-1");
  +            map.put("et", "ISO-8859-1");
  +            map.put("fi", "ISO-8859-1");
  +            map.put("fr", "ISO-8859-1");
  +            map.put("hr", "ISO-8859-2");
  +            map.put("hu", "ISO-8859-2");
  +            map.put("is", "ISO-8859-1");
  +            map.put("it", "ISO-8859-1");
  +            map.put("iw", "ISO-8859-8");
  +            map.put("ja", "Shift_JIS");
  +            map.put("ko", "EUC-KR");
  +            map.put("lt", "ISO-8859-2");
  +            map.put("lv", "ISO-8859-2");
  +            map.put("mk", "ISO-8859-5");
  +            map.put("nl", "ISO-8859-1");
  +            map.put("no", "ISO-8859-1");
  +            map.put("pl", "ISO-8859-2");
  +            map.put("pt", "ISO-8859-1");
  +            map.put("ro", "ISO-8859-2");
  +            map.put("ru", "ISO-8859-5");
  +            map.put("sh", "ISO-8859-5");
  +            map.put("sk", "ISO-8859-2");
  +            map.put("sl", "ISO-8859-2");
  +            map.put("sq", "ISO-8859-2");
  +            map.put("sr", "ISO-8859-5");
  +            map.put("sv", "ISO-8859-1");
  +            map.put("tr", "ISO-8859-9");
  +            map.put("uk", "ISO-8859-5");
  +            map.put("zh", "GB2312");
  +            map.put("zh_TW", "Big5");
  +        }
  +       
  +        /**
  +         * Get the preferred charset for the given locale.
  +         *
  +         * @param locale the locale
  +         * @return the preferred charset
  +         * or null if the locale is not recognized
  +         */
  +        public static String getCharset(Locale locale) {
  +            // try for a full name match (may include country)
  +            String charset = (String) map.get(locale.toString());
  +            if (charset != null) return charset;
  +           
  +            // if a full name didn't match, try just the language
  +            charset = (String) map.get(locale.getLanguage());
  +            return charset;  // may be null
  +        }
  +
  +    }
  +
   }
  +
  
  
  

--
To unsubscribe, e-mail:   <ma...@jakarta.apache.org>
For additional commands, e-mail: <ma...@jakarta.apache.org>