You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@myfaces.apache.org by lu...@apache.org on 2009/05/28 04:36:01 UTC

svn commit: r779411 - in /myfaces/shared/trunk/core/src/main/java/org/apache/myfaces/shared/renderkit/html: HtmlResponseWriterImpl.java util/HTMLEncoder.java

Author: lu4242
Date: Thu May 28 02:36:00 2009
New Revision: 779411

URL: http://svn.apache.org/viewvc?rev=779411&view=rev
Log:
MYFACES-1841 HtmlResponseWriterImpl.writeURIAttribute does not perform proper URLs encoding ( ex: & should be encoded in &amp)

Modified:
    myfaces/shared/trunk/core/src/main/java/org/apache/myfaces/shared/renderkit/html/HtmlResponseWriterImpl.java
    myfaces/shared/trunk/core/src/main/java/org/apache/myfaces/shared/renderkit/html/util/HTMLEncoder.java

Modified: myfaces/shared/trunk/core/src/main/java/org/apache/myfaces/shared/renderkit/html/HtmlResponseWriterImpl.java
URL: http://svn.apache.org/viewvc/myfaces/shared/trunk/core/src/main/java/org/apache/myfaces/shared/renderkit/html/HtmlResponseWriterImpl.java?rev=779411&r1=779410&r2=779411&view=diff
==============================================================================
--- myfaces/shared/trunk/core/src/main/java/org/apache/myfaces/shared/renderkit/html/HtmlResponseWriterImpl.java (original)
+++ myfaces/shared/trunk/core/src/main/java/org/apache/myfaces/shared/renderkit/html/HtmlResponseWriterImpl.java Thu May 28 02:36:00 2009
@@ -389,7 +389,8 @@
                 }
             }
             */
-            _writer.write(strValue);
+            //_writer.write(strValue);
+            _writer.write(org.apache.myfaces.shared.renderkit.html.util.HTMLEncoder.encodeURIAtributte(strValue, _characterEncoding));
         }
         _writer.write('"');
     }

Modified: myfaces/shared/trunk/core/src/main/java/org/apache/myfaces/shared/renderkit/html/util/HTMLEncoder.java
URL: http://svn.apache.org/viewvc/myfaces/shared/trunk/core/src/main/java/org/apache/myfaces/shared/renderkit/html/util/HTMLEncoder.java?rev=779411&r1=779410&r2=779411&view=diff
==============================================================================
--- myfaces/shared/trunk/core/src/main/java/org/apache/myfaces/shared/renderkit/html/util/HTMLEncoder.java (original)
+++ myfaces/shared/trunk/core/src/main/java/org/apache/myfaces/shared/renderkit/html/util/HTMLEncoder.java Thu May 28 02:36:00 2009
@@ -18,6 +18,11 @@
  */
 package org.apache.myfaces.shared.renderkit.html.util;
 
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+
 /**
  * Converts Strings so that they can be used within HTML-Code.
  */
@@ -146,5 +151,394 @@
         }
     }
 
+    
+    /** Upper-case hexadecimal digits; the percent-encoding helpers index it with a 4-bit value. */
+    private static final String HEX_CHARSET = "0123456789ABCDEF";
+    
+    /** Charset name passed to percentEncode by encodeURIAtributte. */
+    private static final String UTF8 = "UTF-8";
+    
+    /**
+     * Encode a URI so that it can be written as the value of an HTML attribute,
+     * percent-encoding the characters that may not appear literally in a URI.
+     * <p>
+     * The text before the first '?' or '#' is percent-encoded using UTF-8. The text
+     * after that first '?' or '#' is passed to encodeURIQuery, which percent-encodes
+     * using <code>characterEncoding</code> and writes a bare '&amp;' as {@code &amp;}.
+     * A '%' that is already followed by two hexadecimal digits (upper or lower case)
+     * is left untouched so that already encoded input is not encoded twice.
+     * <p>
+     * Characters are examined one <code>char</code> at a time.
+     * 
+     * @param string the URI to encode; must not be null
+     * @param characterEncoding name of the charset used to percent-encode the part
+     *        that follows the first '?' or '#'
+     * @return the encoded URI, or the same String instance if nothing had to be encoded
+     * @throws IOException declared in the signature; no call in this method's body
+     *         is declared to throw it
+     */
+    public static String encodeURIAtributte(final String string, final String characterEncoding)
+        throws IOException
+    {
+        // These are the guidelines taken into account by this algorithm:
+        //
+        // RFC 2396 Section 2.4.3 Excluded US-ASCII Characters
+        //
+        //   control     = <US-ASCII coded characters 00-1F and 7F hexadecimal>
+        //   space       = <US-ASCII coded character 20 hexadecimal>
+        //   delims      = "<" | ">" | "#" | "%" | <">
+        //                 %3C   %3E   %23   %25   %22
+        //   unwise      = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`"
+        //                 %7B   %7D   %7C   %5C   %5E   %5B   %5D   %60
+        //
+        //   "... Data corresponding to excluded characters must be escaped in order to
+        //   be properly represented within a URI ..."
+        //
+        // RFC 3986 Section 3: URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
+        //
+        // RFC 3986 Section 2.2: reserved characters (should not be percent-encoded)
+        //
+        //   gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
+        //                 %3A   %2F   %3F   %23   %5B   %5D   %40
+        //   sub-delims  = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
+        //                 %21   %24   %26   %27   %28   %29   %2A   %2B   %2C   %3B   %3D
+        //
+        //   RFC 2396 listed "[", "]" and "#" as excluded; RFC 3986 (appendix D, changes
+        //   from RFC 2396) moved them to the reserved list.
+        //
+        // RFC 3986 Section 2.3: percent-encoded octets for ALPHA, DIGIT, hyphen, period,
+        //   underscore or tilde should not be created by URI producers.
+        //
+        // RFC 3986 Section 3.2.2 (host): non-ASCII characters of a reg-name must first be
+        //   encoded according to UTF-8, then each octet is percent-encoded.
+        //
+        // RFC 3986 Sections 3.4 and 3.5: query and fragment = *( pchar / "/" / "?" ).
+        // RFC 3986 Section 2.5: textual data is encoded to octets first and only octets
+        //   outside the unreserved set are percent-encoded (LATIN CAPITAL LETTER A WITH
+        //   GRAVE becomes "%C3%80" in UTF-8).
+        //
+        // Strategy for scheme ":" hier-part (everything before the first '?' or '#'),
+        // percent-encode using UTF-8:
+        //
+        // - From %00 to %20
+        // - <"> %22, "<" %3C, ">" %3E
+        // - "\" %5C, "^" %5E, "`" %60
+        // - "{" %7B, "|" %7C, "}" %7D
+        // - From %7F upwards (characters above %FF should not be used in this part of
+        //   a URI, but encoding them is preferred over dropping them)
+        // - "%" %25 only when it is not already followed by two hex digits, to avoid
+        //   encoding twice
+        //
+        // The remaining characters must not be encoded.
+        //
+        // Characters after '?' or '#' get the same treatment (see encodeURIQuery), except
+        // that the document character encoding is used, because those values could be
+        // read back with httpRequest.getParameter(), which decodes them using the current
+        // character encoding. There "&" is also written as "&amp;" because the link is
+        // inside an html page, where a bare "&" is invalid.
+        //
+        // UTF-8 for the first part is also recommended by
+        // http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars
+        // and Jetty uses UTF-8 by default (see http://jira.codehaus.org/browse/JETTY-113).
+
+        StringBuilder sb = null;    //create later on demand
+        final int length = string.length();
+        for (int i = 0; i < length; ++i)
+        {
+            final char c = string.charAt(i);
+            String app = null;
+            boolean endLoop = false;
+
+            if (   (c <= (char)0x20) || (c >= (char)0x7F) ||
+                    c == '"' || c == '<' ||
+                    c == '>' || c == '\\' || c == '^' || c == '`' ||
+                    c == '{' || c == '|' || c == '}')
+            {
+                app = percentEncode(c, UTF8);
+            }
+            else if (c == '%')
+            {
+                // Leave "%XX" alone only when XX really are two ASCII hex digits
+                // (either case); anything else is a literal '%' and becomes "%25".
+                boolean alreadyEncoded = false;
+                if (i + 2 < length)
+                {
+                    final char c1 = string.charAt(i + 1);
+                    final char c2 = string.charAt(i + 2);
+                    alreadyEncoded = c1 < 0x80 && Character.digit(c1, 16) >= 0
+                                  && c2 < 0x80 && Character.digit(c2, 16) >= 0;
+                }
+                if (!alreadyEncoded)
+                {
+                    app = percentEncode(c, UTF8);
+                }
+            }
+            else if ((c == '?' || c == '#') && i + 1 < length)
+            {
+                // The remaining part of the URI is data that should be encoded
+                // using the document character encoding.
+                app = c + encodeURIQuery(string.substring(i + 1), characterEncoding);
+                endLoop = true;
+            }
+
+            if (app != null)
+            {
+                if (sb == null)
+                {
+                    // First change: copy the untouched prefix.
+                    sb = new StringBuilder(string.substring(0, i));
+                }
+                sb.append(app);
+            }
+            else if (sb != null)
+            {
+                sb.append(c);
+            }
+
+            if (endLoop)
+            {
+                break;
+            }
+        }
+
+        return (sb == null) ? string : sb.toString();
+    }
+    
+    /**
+     * Encode a unicode char value in percentEncode, decoding its bytes using a specified 
+     * characterEncoding.
+     * 
+     * @param c
+     * @param characterEncoding
+     * @return
+     */
+    private static String percentEncode(char c, String characterEncoding)
+    {
+        String app = null;
+        if (c > (char)((short)0x007F))
+        {
+            //percent encode in the proper encoding to be consistent
+            app = percentEncodeNonUsAsciiCharacter(c, characterEncoding);
+        }
+        else
+        {
+            //percent encode US-ASCII char (0x00-0x7F range)
+            app = "%" + HEX_CHARSET.charAt( ((c >> 0x4) % 0x10)) +HEX_CHARSET.charAt(c % 0x10);
+        }
+        return app;
+    }
+    
+    private static String percentEncodeNonUsAsciiCharacter(char c, String characterEncoding)
+    {
+        ByteArrayOutputStream baos = new ByteArrayOutputStream(10);
+        StringBuffer builder = new StringBuffer();
+        try
+        {
+            OutputStreamWriter writer = new OutputStreamWriter(baos,characterEncoding);
+            writer.write(c);
+            writer.flush();
+        }
+        catch(IOException e)
+        {
+            baos.reset();
+            return null;
+        }
+        
+        byte [] byteArray =  baos.toByteArray();
+        for (int i=0; i < byteArray.length; i++)
+        {
+            builder.append('%');
+            builder.append(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) );
+            builder.append(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10));
+        }
+        
+        return builder.toString();
+    }
+
+    /**
+     * Encode the query part using the document charset encoding provided.
+     * 
+     * 
+     * @param string
+     * @param characterEncoding
+     * @return
+     */
+    private static String encodeURIQuery(final String string, final String characterEncoding)
+    {
+        StringBuilder sb = null;    //create later on demand
+        String app;
+        char c;
+        boolean endLoop = false;
+        for (int i = 0; i < string.length (); ++i)
+        {
+            app = null;
+            c = string.charAt(i);
+            
+            // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20)
+            // - <"> %22 (If there is encode of "%", there is a risk of duplicate encoding, so we make easier and omit this one)
+            // - "<" %3C, ">" %3E,
+            // - "\" %5C, "^" %5E, "`" %60 
+            // - "{" %7B, "|" %7C, "}" %7D
+            // - From %7F ad infinitum (each character as many bytes as necessary but take into account
+            //   that a single char should contain 2,3 or more bytes!. This data should be encoded translating from the document
+            //   character encoding to percent encoding)
+            //
+            // "&" should be encoded as "&amp;" because this link is inside an html page, and 
+            // put & is invalid in this context   
+            
+            if (   (c <= (char)0x20) || (c >= (char)0x7F) || 
+                    c == '"' || c == '<' ||
+                    c == '>' || c == '\\' || c == '^' || c == '`' ||
+                    c == '{' || c == '|' || c == '}')
+            {
+                // The percent encoding on this part should be done using UTF-8 charset
+                // as RFC 3986 Section 3.2.2 says
+                app = percentEncode(c, characterEncoding);
+            }
+            else if (c == '%')
+            {
+                if (i + 2 < string.length())
+                {
+                    char c1 = string.charAt(i+1);
+                    char c2 = string.charAt(i+2);
+                    if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z')) &&
+                        (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z')))
+                    {
+                        // do not percent encode, because it could be already encoded
+                    }
+                    else
+                    {
+                        app = percentEncode(c, characterEncoding);
+                    }
+                }
+                else
+                {
+                    app = percentEncode(c, characterEncoding);
+                }
+            }
+            else if (c == '&')
+            {
+                if (i+4 < string.length() )
+                {
+                    if ('a' == string.charAt(i+1) &&
+                        'm' == string.charAt(i+2) &&
+                        'p' == string.charAt(i+3) &&
+                        ';' == string.charAt(i+4))
+                    {
+                        //Skip
+                    }
+                    else
+                    {
+                        app = "&amp;";
+                    }
+                }
+                else
+                {
+                    app = "&amp;";
+                }
+            }
+            else
+            {
+                //No encoding, just do nothing, char will be added later.
+            }
+                        
+            if (app != null)
+            {
+                if (sb == null)
+                {
+                    sb = new StringBuilder(string.substring(0, i));
+                }
+                sb.append(app);
+            } else {
+                if (sb != null)
+                {
+                    sb.append(c);
+                }
+            }
+            if (endLoop)
+            {
+                break;
+            }
+        }
+        if (sb == null)
+        {
+            return string;
+        }
+        else
+        {
+            return sb.toString();
+        }
+    }
 }