You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@myfaces.apache.org by lu...@apache.org on 2009/05/28 04:36:25 UTC
svn commit: r779412 - in /myfaces/shared/trunk_3.0.x/core/src: main/java/org/apache/myfaces/shared/renderkit/html/ main/java/org/apache/myfaces/shared/renderkit/html/util/ test/java/org/apache/myfaces/shared/renderkit/html/util/

Author: lu4242
Date: Thu May 28 02:36:25 2009
New Revision: 779412

URL: http://svn.apache.org/viewvc?rev=779412&view=rev
Log:
MYFACES-1841 HtmlResponseWriterImpl.writeURIAttribute does not perform proper URLs encoding ( ex: & should be encoded in &amp)

Modified:
    myfaces/shared/trunk_3.0.x/core/src/main/java/org/apache/myfaces/shared/renderkit/html/HtmlResponseWriterImpl.java
    myfaces/shared/trunk_3.0.x/core/src/main/java/org/apache/myfaces/shared/renderkit/html/util/HTMLEncoder.java
    myfaces/shared/trunk_3.0.x/core/src/test/java/org/apache/myfaces/shared/renderkit/html/util/HTMLEncoderTest.java

Modified: myfaces/shared/trunk_3.0.x/core/src/main/java/org/apache/myfaces/shared/renderkit/html/HtmlResponseWriterImpl.java
URL: http://svn.apache.org/viewvc/myfaces/shared/trunk_3.0.x/core/src/main/java/org/apache/myfaces/shared/renderkit/html/HtmlResponseWriterImpl.java?rev=779412&r1=779411&r2=779412&view=diff
==============================================================================
--- myfaces/shared/trunk_3.0.x/core/src/main/java/org/apache/myfaces/shared/renderkit/html/HtmlResponseWriterImpl.java (original)
+++ myfaces/shared/trunk_3.0.x/core/src/main/java/org/apache/myfaces/shared/renderkit/html/HtmlResponseWriterImpl.java Thu May 28 02:36:25 2009
@@ -385,7 +385,8 @@
                 }
             }
             */
-            _writer.write(strValue);
+            //_writer.write(strValue);
+            _writer.write(org.apache.myfaces.shared.renderkit.html.util.HTMLEncoder.encodeURIAtributte(strValue, _characterEncoding));
         }
         _writer.write('"');
     }

Modified: myfaces/shared/trunk_3.0.x/core/src/main/java/org/apache/myfaces/shared/renderkit/html/util/HTMLEncoder.java
URL: http://svn.apache.org/viewvc/myfaces/shared/trunk_3.0.x/core/src/main/java/org/apache/myfaces/shared/renderkit/html/util/HTMLEncoder.java?rev=779412&r1=779411&r2=779412&view=diff
==============================================================================
--- myfaces/shared/trunk_3.0.x/core/src/main/java/org/apache/myfaces/shared/renderkit/html/util/HTMLEncoder.java (original)
+++ myfaces/shared/trunk_3.0.x/core/src/main/java/org/apache/myfaces/shared/renderkit/html/util/HTMLEncoder.java Thu May 28 02:36:25 2009
@@ -18,7 +18,9 @@
  */
 package org.apache.myfaces.shared.renderkit.html.util;
 
+import java.io.ByteArrayOutputStream;
 import java.io.IOException;
+import java.io.OutputStreamWriter;
 import java.io.Writer;
 
 /**
@@ -274,4 +276,394 @@
             writer.write(sb.toString());
         }
     }
+    
+    private static final String HEX_CHARSET = "0123456789ABCDEF"; // lookup table: charAt(n) is the uppercase hex digit for nibble n
+    
+    private static final String UTF8 = "UTF-8"; // charset name passed to percentEncode for text outside the query/fragment
+    
+    /**
+     * Encodes a URI for use as an HTML attribute value. Before the first "?" or "#", excluded
+     * characters are percent-encoded as UTF-8; the rest is delegated to encodeURIQuery.
+     * 
+     * @param string the URI to encode
+     * @param characterEncoding charset name used to percent-encode the part after the first "?" or "#"
+     * @return the encoded URI, or the same String instance when nothing needed encoding
+     * @throws IOException declared for callers; not thrown by the current implementation
+     */
+    public static String encodeURIAtributte(final String string, final String characterEncoding)
+        throws IOException
+    {
+        StringBuilder sb = null;    //create later on demand
+        String app;
+        char c;
+        boolean endLoop = false;
+        for (int i = 0; i < string.length (); ++i)
+        {
+            app = null;
+            c = string.charAt(i);
+            
+            // These are the guidelines to be taken into account by this algorithm to encode:
+            
+            // RFC 2396 Section 2.4.3 Excluded US-ASCII Characters
+            //
+            // control     = <US-ASCII coded characters 00-1F and 7F hexadecimal>
+            // space       = <US-ASCII coded character 20 hexadecimal>
+            // delims      = "<" | ">" | "#" | "%" | <">
+            //               %3C   %3E   %23   %25   %22
+            // unwise      = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`"
+            //               %7B   %7D   %7C   %5C   %5E   %5B   %5D   %60
+            //
+            // ".... Data corresponding to excluded characters must be escaped in order to
+            // be properly represented within a URI....."
+            
+            // RFC 3986 Section 3.  Syntax Components
+            //
+            // "... The generic URI syntax consists of a hierarchical sequence of
+            // components referred to as the scheme, authority, path, query, and
+            // fragment.
+            //
+            //   URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
+            //
+            //   hier-part   = "//" authority path-abempty
+            //               / path-absolute
+            //               / path-rootless
+            //               / path-empty
+            // ...."
+            
+            // RFC 3986 Section 2.2:
+            // Reserved characters (should not be percent-encoded)
+            // reserved    = gen-delims / sub-delims
+            // gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
+            //               %3A   %2F   %3F   %23   %5B   %5D   %40
+            // sub-delims  = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
+            //               %21   %24   %26   %27   %28   %29   %2A   %2B   %2C   %3B   %3D
+            
+            // Note that chars "[" and "]" are mentioned as they should be escaped on RFC 2396,
+            // but on the part D. Changes from RFC 2396 says about this chars (used on IPv6) 
+            // "...those rules were redefined to directly specify the characters allowed...."
+            // There is also other characters moved from excluded list to reserved:
+            // "[" / "]" / "#"  
+            
+            // RFC 3986 Section 2.3:
+            // "... for consistency, percent-encoded octets in the ranges of ALPHA
+            // (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
+            // underscore (%5F), or tilde (%7E) should not be created by URI
+            // producers...."
+            
+            // RFC 3986 Section  3.2.2.  Host
+
+            // host = IP-literal / IPv4address / reg-name
+
+            // The reg-name syntax allows percent-encoded octets in order to
+            // represent non-ASCII registered names in a uniform way that is
+            // independent of the underlying name resolution technology.  Non-ASCII
+            // characters must first be encoded according to UTF-8 [STD63], and then
+            // each octet of the corresponding UTF-8 sequence must be percent-
+            // encoded to be represented as URI characters.  URI producing
+            // applications must not use percent-encoding in host unless it is used
+            // to represent a UTF-8 character sequence.
+            
+            // RFC 3986 Section 3.4 Query 
+            //         query       = *( pchar / "/" / "?" )
+            //
+            // "...  However, as query components are often used to carry identifying information 
+            // in the form of "key=value" pairs and one frequently used value is a reference to
+            // another URI, it is sometimes better for usability to avoid percent-encoding those characters....."
+            //
+            // RFC 3986 Section 2.5 Identifying Data (Apply to query section)
+            //
+            // When a new URI scheme defines a component that represents textual
+            // data consisting of characters from the Universal Character Set [UCS],
+            // the data should first be encoded as octets according to the UTF-8
+            // character encoding [STD63]; then only those octets that do not
+            // correspond to characters in the unreserved set should be percent-
+            // encoded.  For example, the character A would be represented as "A",
+            // the character LATIN CAPITAL LETTER A WITH GRAVE would be represented
+            // as "%C3%80", and the character KATAKANA LETTER A would be represented
+            // as "%E3%82%A2".
+            //
+            // RFC 3986 Section 3.5 Fragment
+            //         fragment    = *( pchar / "/" / "?" )
+            //
+            // Note that follows the same as query
+            
+            // Based on the extracts the strategy to apply on this method is:
+            // 
+            // On scheme ":" hier-part
+            //
+            // Escape or percent encode chars inside :
+            // 
+            // - From %00 to %20, 
+            // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of 
+            //                     duplicate encoding, encode it when we are sure 
+            //                     that there are not encoded twice)
+            // - "<" %3C, ">" %3E
+            // - "\" %5C, "^" %5E, "`" %60 
+            // - "{" %7B, "|" %7C, "}" %7D
+            // - From %7F ad infinitum (characters from %100 to infinitum should not be used in this
+            //   part of an URI, but it is preferred to encode it that omit it).
+            //
+            // The remaining characters must not be encoded
+            //
+            // Characters after ? or # should be percent encoding but only the necessary ones:
+            //
+            // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20)
+            // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of 
+            //                     duplicate encoding, encode it when we are sure 
+            //                     that there are not encoded twice)
+            // - "<" %3C, ">" %3E,
+            // - "\" %5C, "^" %5E, "`" %60 
+            // - "{" %7B, "|" %7C, "}" %7D
+            // - From %7F ad infinitum (each character as many bytes as necessary but take into account
+            //   that a single char should contain 2,3 or more bytes!. This data should be encoded 
+            //   translating from the document character encoding to percent encoding, because this values
+            //   could be retrieved from httpRequest.getParameter() and it uses the current character encoding
+            //   for decode values)
+            //
+            // "&" should be encoded as "&amp;" because this link is inside an html page, and 
+            // put only & is invalid in this context.
+
+            if (   (c <= (char)0x20) || (c >= (char)0x7F) || 
+                    c == '"' || c == '<' ||
+                    c == '>' || c == '\\' || c == '^' || c == '`' ||
+                    c == '{' || c == '|' || c == '}')
+            {
+                // The percent encoding on this part should be done using UTF-8 charset
+                // as RFC 3986 Section 3.2.2 says.
+                // Also there is a reference on 
+                // http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars
+                // that recommend use of UTF-8 instead the document character encoding.
+                // Jetty set by default UTF-8 (see http://jira.codehaus.org/browse/JETTY-113)
+                app = percentEncode(c, UTF8);
+            }
+            else if (c == '%')
+            {
+                if (i + 2 < string.length())
+                {
+                    char c1 = string.charAt(i+1);
+                    char c2 = string.charAt(i+2);
+                    if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='F') || (c1 >='a' && c1 <='f')) &&
+                        (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='F') || (c2 >='a' && c2 <='f')))
+                    {
+                        // "%" followed by two hex digits (either case, RFC 3986 Section 2.1) is
+                        // an existing escape: leave it alone so it is not encoded twice
+                    }
+                    else
+                    {
+                        app = percentEncode(c, UTF8);
+                    }
+                }
+                else
+                {
+                    app = percentEncode(c, UTF8);
+                }
+            }
+            else if (c == '?' || c == '#')
+            {
+                if (i+1 < string.length())
+                {
+                    // The remaining part of the URI are data that should be encoded
+                    // using the document character encoding.
+                    app = c + encodeURIQuery(string.substring(i+1), characterEncoding);
+                    endLoop = true;
+                }
+            }
+            else
+            {
+                //No encoding, just do nothing, char will be added later.
+            }
+                        
+            if (app != null)
+            {
+                if (sb == null)
+                {
+                    sb = new StringBuilder(string.substring(0, i));
+                }
+                sb.append(app);
+            } else {
+                if (sb != null)
+                {
+                    sb.append(c);
+                }
+            }
+            if (endLoop)
+            {
+                break;
+            }
+        }
+        if (sb == null)
+        {
+            return string;
+        }
+        else
+        {
+            return sb.toString();
+        }
+    }
+    
+    /**
+     * Percent-encodes a single char. ASCII chars (0x00-0x7F) become one "%XX" triplet;
+     * any other char is converted to bytes in the given charset, one triplet per byte.
+     * 
+     * @param c the char to encode
+     * @param characterEncoding charset name used for chars above 0x7F
+     * @return the percent-encoded form; may be null, see percentEncodeNonUsAsciiCharacter
+     */
+    private static String percentEncode(char c, String characterEncoding)
+    {
+        if (c <= 0x7F)
+        {
+            // US-ASCII: a single byte, so emit its high and low nibble directly.
+            final char high = HEX_CHARSET.charAt((c >> 4) & 0xF);
+            final char low = HEX_CHARSET.charAt(c & 0xF);
+            return "%" + high + low;
+        }
+        else
+        {
+            // Outside US-ASCII: let the charset encoder decide which bytes to emit.
+            return percentEncodeNonUsAsciiCharacter(c, characterEncoding);
+        }
+    }
+    
+    // Percent-encodes c as "%XX" per byte of its characterEncoding form; null if encoding fails.
+    private static String percentEncodeNonUsAsciiCharacter(char c, String characterEncoding)
+    {
+        final ByteArrayOutputStream sink = new ByteArrayOutputStream(10);
+        try
+        {
+            final OutputStreamWriter encoder = new OutputStreamWriter(sink, characterEncoding);
+            encoder.write(c);
+            encoder.flush();
+        }
+        catch (IOException e)
+        {
+            // Covers an unsupported charset name; callers receive null instead of an exception.
+            return null;
+        }
+
+        final StringBuilder encoded = new StringBuilder();
+        for (final byte octet : sink.toByteArray())
+        {
+            final int unsigned = octet & 0xFF;   // bytes are signed in Java
+            encoded.append('%');
+            encoded.append(HEX_CHARSET.charAt(unsigned >> 4));
+            encoded.append(HEX_CHARSET.charAt(unsigned & 0xF));
+        }
+        return encoded.toString();
+    }
+
+    /**
+     * Encodes the query/fragment part of a URI (the text after the first "?" or "#") for use
+     * inside an HTML attribute: excluded characters are percent-encoded using the supplied
+     * charset, a "%" not followed by two hex digits becomes "%25", and a bare "&" becomes
+     * "&amp;" (an existing "&amp;" is left as is).
+     * 
+     * @param string the part of the URI that follows the first "?" or "#"
+     * @param characterEncoding charset name used to percent-encode chars above 0x7F
+     *                          (normally the document character encoding)
+     * @return the encoded text, or the same String instance when nothing needed encoding
+     */
+    private static String encodeURIQuery(final String string, final String characterEncoding)
+    {
+        StringBuilder sb = null;    //create later on demand
+        String app;
+        char c;
+        for (int i = 0; i < string.length (); ++i)
+        {
+            app = null;
+            c = string.charAt(i);
+            
+            // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20)
+            // - <"> %22, "%" %25 (only when it does not already start a "%XX" escape, to avoid double encoding)
+            // - "<" %3C, ">" %3E,
+            // - "\" %5C, "^" %5E, "`" %60 
+            // - "{" %7B, "|" %7C, "}" %7D
+            // - From %7F ad infinitum (each character as many bytes as necessary but take into account
+            //   that a single char should contain 2,3 or more bytes!. This data should be encoded translating from the document
+            //   character encoding to percent encoding)
+            //
+            // "&" should be encoded as "&amp;" because this link is inside an html page, and 
+            // put & is invalid in this context   
+            
+            if (   (c <= (char)0x20) || (c >= (char)0x7F) || 
+                    c == '"' || c == '<' ||
+                    c == '>' || c == '\\' || c == '^' || c == '`' ||
+                    c == '{' || c == '|' || c == '}')
+            {
+                // In the query/fragment the percent encoding uses the supplied
+                // characterEncoding (the document encoding) rather than a fixed UTF-8
+                app = percentEncode(c, characterEncoding);
+            }
+            else if (c == '%')
+            {
+                if (i + 2 < string.length())
+                {
+                    char c1 = string.charAt(i+1);
+                    char c2 = string.charAt(i+2);
+                    if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='F') || (c1 >='a' && c1 <='f')) &&
+                        (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='F') || (c2 >='a' && c2 <='f')))
+                    {
+                        // "%" followed by two hex digits (either case, RFC 3986 Section 2.1) is
+                        // an existing escape: do not percent encode it a second time
+                    }
+                    else
+                    {
+                        app = percentEncode(c, characterEncoding);
+                    }
+                }
+                else
+                {
+                    app = percentEncode(c, characterEncoding);
+                }
+            }
+            else if (c == '&')
+            {
+                if (i+4 < string.length() )
+                {
+                    if ('a' == string.charAt(i+1) &&
+                        'm' == string.charAt(i+2) &&
+                        'p' == string.charAt(i+3) &&
+                        ';' == string.charAt(i+4))
+                    {
+                        // Already the "&amp;" entity: skip it so the output
+                        // does not become "&amp;amp;"
+                    }
+                    else
+                    {
+                        app = "&amp;";
+                    }
+                }
+                else
+                {
+                    app = "&amp;";
+                }
+            }
+            else
+            {
+                //No encoding, just do nothing, char will be added later.
+            }
+                        
+            if (app != null)
+            {
+                if (sb == null)
+                {
+                    sb = new StringBuilder(string.substring(0, i));
+                }
+                sb.append(app);
+            } else {
+                if (sb != null)
+                {
+                    sb.append(c);
+                }
+            }
+        }
+        if (sb == null)
+        {
+            return string;
+        }
+        else
+        {
+            return sb.toString();
+        }
+    }
 }

Modified: myfaces/shared/trunk_3.0.x/core/src/test/java/org/apache/myfaces/shared/renderkit/html/util/HTMLEncoderTest.java
URL: http://svn.apache.org/viewvc/myfaces/shared/trunk_3.0.x/core/src/test/java/org/apache/myfaces/shared/renderkit/html/util/HTMLEncoderTest.java?rev=779412&r1=779411&r2=779412&view=diff
==============================================================================
--- myfaces/shared/trunk_3.0.x/core/src/test/java/org/apache/myfaces/shared/renderkit/html/util/HTMLEncoderTest.java (original)
+++ myfaces/shared/trunk_3.0.x/core/src/test/java/org/apache/myfaces/shared/renderkit/html/util/HTMLEncoderTest.java Thu May 28 02:36:25 2009
@@ -16,9 +16,13 @@
 
 package org.apache.myfaces.shared.renderkit.html.util;
 
+import java.io.ByteArrayOutputStream;
 import java.io.CharArrayWriter;
 import java.io.IOException;
-import java.io.Writer;
+import java.io.OutputStreamWriter;
+import java.io.UnsupportedEncodingException;
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
 
 import org.apache.shale.test.base.AbstractJsfTestCase;
 
@@ -236,4 +240,105 @@
       assertEquals(expected[i], actual[i]);
     }
   }
+  
+  public void testSimpleWriteURIAttribute() throws Exception
+  {
+      // A bare "&" in the query string must come out as the HTML entity "&amp;".
+      final String rawUri = "http://myfaces.apache.org/hello.jsf?key1=val&key2=val2#id";
+      final String expected = "http://myfaces.apache.org/hello.jsf?key1=val&amp;key2=val2#id";
+      assertEquals(expected, HTMLEncoder.encodeURIAtributte(rawUri, "UTF-8"));
+  }
+  
+  public void testUsAsciiEscapedCharactersBeforeQuery() throws Exception
+  {
+      // Characters that must be percent-encoded:
+      // - control chars and space (%00 to %20)
+      // - <"> %22, "%" %25 (when not followed by two hex digits)
+      // - "<" %3C, ">" %3E
+      // - "\" %5C, "^" %5E, "`" %60
+      // - "{" %7B, "|" %7C, "}" %7D
+      // - everything from %7F upwards
+      // The "%" below is followed by "<", so it is not an existing escape and becomes %25.
+      final String unsafe = "\"%<>\\`{|}^\n ";
+      final String escaped = "%22%25%3C%3E%5C%60%7B%7C%7D%5E%0A%20";
+
+      // Inside the query part (after "?").
+      assertEquals("?key=" + escaped,
+              HTMLEncoder.encodeURIAtributte("?key=" + unsafe, "UTF-8"));
+
+      // Before any "?" or "#".
+      assertEquals(escaped,
+              HTMLEncoder.encodeURIAtributte(unsafe, "UTF-8"));
+  }
+  
+  public void testWriteNonUsAsciiOnURIAttribute() throws Exception
+  {
+      // U+00FC (u with diaeresis) is %FC in ISO-8859-1 but %C3%BC in UTF-8; the UTF-8 form
+      // is expected here. The char is written as a Unicode escape so the test does not
+      // depend on the encoding used to compile this source file.
+      String cad1 = "\u00FC";
+      String cad2 = "%C3%BC";
+      String cad3 = HTMLEncoder.encodeURIAtributte(cad1,"UTF-8");
+      assertEquals(cad2, cad3);
+  }
+  
+  public void testReservedCharactersOnURIAttribute() throws Exception
+  {
+      // RFC 3986 Section 2.2 reserved characters must pass through
+      // without being percent-encoded:
+      // reserved    = gen-delims / sub-delims
+      // gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
+      //               %3A   %2F   %3F   %23   %5B   %5D   %40
+      // sub-delims  = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
+      //               %21   %24   %26   %27   %28   %29   %2A   %2B   %2C   %3B   %3D
+
+      // In the query part "&" is left out here because it is rewritten to "&amp;".
+      final String inQuery = "?key=:/[]@!$'()*+,;=";
+      assertEquals(inQuery, HTMLEncoder.encodeURIAtributte(inQuery, "UTF-8"));
+
+      // Before the query part "&" is kept as is.
+      final String beforeQuery = ":/[]@!$&'()*+,;=";
+      assertEquals(beforeQuery, HTMLEncoder.encodeURIAtributte(beforeQuery, "UTF-8"));
+  }
+
+  public void testNonEncodedCharactersOnURIAttribute() throws Exception
+  {
+      // RFC 3986 Section 2.3: "... for consistency, percent-encoded octets in the ranges
+      // of ALPHA (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
+      // underscore (%5F), or tilde (%7E) should not be created by URI producers...."
+      final String unreserved =
+          "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~";
+      // Must come back unchanged in the query, in the fragment and before either of them.
+      final String[] inputs = {
+          "?key=" + unreserved,
+          "#somefile?key=" + unreserved,
+          unreserved
+      };
+      for (String input : inputs)
+      {
+          assertEquals(input, HTMLEncoder.encodeURIAtributte(input, "UTF-8"));
+      }
+  }
+
+  public void testWriteURIAttribute() throws Exception
+  {
+      // Chars U+00A1..U+00B1 plus U+00FF and U+0100 must be percent-encoded as their UTF-8
+      // bytes, both before and inside the query part. U+0100 is not expected in a URI, but
+      // the encoder percent-encodes it rather than dropping it.
+      // Unicode escapes are used so that the invisible soft hyphen (U+00AD) is explicit and
+      // the test does not depend on the encoding used to compile this source file.
+
+      String cad11 = "\u00A1\u00A2\u00A3\u00A4\u00A5\u00A6\u00A7\u00A8\u00A9\u00AA\u00AB\u00AC\u00AD"+
+                     "\u00AE\u00AF\u00B0\u00B1\u00FF\u0100";
+      String cad12 = "%C2%A1%C2%A2%C2%A3%C2%A4%C2%A5%C2%A6%C2%A7%C2%A8%C2%A9%C2%AA%C2%AB%C2%AC%C2%AD"+
+                     "%C2%AE%C2%AF%C2%B0%C2%B1%C3%BF%C4%80";
+      String cad13 = HTMLEncoder.encodeURIAtributte(cad11,"UTF-8");
+      assertEquals(cad12, cad13);
+
+      String cad1 = "?key=" + cad11;
+      String cad2 = "?key=" + cad12;
+      String cad3 = HTMLEncoder.encodeURIAtributte(cad1,"UTF-8");
+      assertEquals(cad2, cad3);
+  }
+    
 }
\ No newline at end of file