You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@wookie.apache.org by ps...@apache.org on 2011/10/07 13:20:23 UTC

svn commit: r1180006 - in /incubator/wookie/trunk/src/org/apache/wookie/util/html: Html5DoctypeToken.java HtmlCleaner.java

Author: psharples
Date: Fri Oct  7 11:20:23 2011
New Revision: 1180006

URL: http://svn.apache.org/viewvc?rev=1180006&view=rev
Log:
Fix for HtmlCleaner's bad rewriting of HTML5 doctypes (it originally added a null and an empty string to <!DOCTYPE html> type declarations).

Added:
    incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java   (with props)
Modified:
    incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java

Added: incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java
URL: http://svn.apache.org/viewvc/incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java?rev=1180006&view=auto
==============================================================================
--- incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java (added)
+++ incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java Fri Oct  7 11:20:23 2011
@@ -0,0 +1,42 @@
+/*
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.wookie.util.html;
+
+import org.htmlcleaner.DoctypeToken;
+
+/**
+ * An extended HTML Cleaner DoctypeToken class to deal with HTML5 declarations better than
+ * the default, which displays empty strings and nulls.
+ * Note: <!DOCTYPE html SYSTEM "about:legacy-compat"> is also a valid HTML5 doctype - but html cleaner only makes the html
+ * into uppercase, which although is still not correct, doesn't seem to cause problems in wookie at present.
+ * See http://sourceforge.net/tracker/?func=detail&aid=3190583&group_id=183053&atid=903696
+ */
+public class Html5DoctypeToken extends DoctypeToken {
+	/** The mangled doctype text that HTML Cleaner produces for an HTML5 declaration. */
+	public static final String BADDOCTYPE = "<!DOCTYPE HTML null \"\">";
+	/** The correct HTML5 doctype declaration, returned by {@link #getContent()}. */
+	public static final String GOODDOCTYPE = "<!DOCTYPE html>";
+
+	/** Passes the four doctype parts unchanged to the DoctypeToken constructor. */
+	public Html5DoctypeToken(String part1, String part2, String part3,
+			String part4) {
+		super(part1, part2, part3, part4);
+	}
+
+	/** @return always {@link #GOODDOCTYPE}, regardless of the constructor arguments */
+	@Override
+	public String getContent(){
+		return GOODDOCTYPE;
+	}
+}

Propchange: incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java
URL: http://svn.apache.org/viewvc/incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java?rev=1180006&r1=1180005&r2=1180006&view=diff
==============================================================================
--- incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java (original)
+++ incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java Fri Oct  7 11:20:23 2011
@@ -20,6 +20,7 @@ import java.util.ArrayList;
 import java.util.List;
 
 import org.htmlcleaner.CleanerProperties;
+import org.htmlcleaner.DoctypeToken;
 import org.htmlcleaner.TagNode;
 
 /**
@@ -64,11 +65,12 @@ public class HtmlCleaner implements IHtm
 	public void setReader(Reader reader) throws IOException{
 		if (reader == null) throw new IOException("Reader was null");
 		this.reader = reader;
-		htmlNode = cleaner.clean(this.reader);			
+		htmlNode = cleaner.clean(this.reader);
 		headNode = htmlNode.findElementByName(HEAD_TAG, false);	
 		// remove widget-specific scripts. These will be replaced
 		// after processing, so that the injected scripts come first
 		removeUserScripts();
+		fixHTML5Doctype();
 	}
 	
 	/* (non-Javadoc)
@@ -167,5 +169,17 @@ public class HtmlCleaner implements IHtm
 			headNode.addChild(node);
 		}
 	}
+	
+	/**
+	 *  Fix for a bug in HTMLCleaner which cannot handle HTML5 doctypes correctly: replaces the mangled doctype with an Html5DoctypeToken.
+	 *  See http://sourceforge.net/tracker/?func=detail&aid=3190583&group_id=183053&atid=903696
+	 */
+	private void fixHTML5Doctype(){
+		DoctypeToken docType = htmlNode.getDocType();
+		// Guard: a missing doctype token (presumably when the source declares none - TODO confirm) would otherwise cause a NullPointerException.
+		if(docType != null && Html5DoctypeToken.BADDOCTYPE.equalsIgnoreCase(docType.getContent())){
+			htmlNode.setDocType(new Html5DoctypeToken("html",null,null,null));
+		}
+	}
 
 }



Re: svn commit: r1180006 - in /incubator/wookie/trunk/src/org/apache/wookie/util/html: Html5DoctypeToken.java HtmlCleaner.java

Posted by Paul Sharples <p....@bolton.ac.uk>.
On 07/10/2011 12:40, Scott Wilson wrote:
> There is an issue in the HtmlCleaner tracker for this:
>
> http://sourceforge.net/tracker/index.php?func=detail&aid=3190583&group_id=183053&atid=903696

Yup, I referenced this link in the Wookie fix - see the inline comments in 
the last commit.  However, I would guess that rather than subclassing as I 
have done, they will probably want to fix the core base class so you don't 
have to. I could always post something there pointing to our solution 
for anybody else looking for a fix.

>
> On 7 Oct 2011, at 12:20, psharples@apache.org wrote:
>
>> Author: psharples
>> Date: Fri Oct  7 11:20:23 2011
>> New Revision: 1180006
>>
>> URL: http://svn.apache.org/viewvc?rev=1180006&view=rev
>> Log:
>> Fix for HTMLCleaners bad rewriting of HTML5 doctypes. (it originally added a null and an empty string to<!DOCTYPE html>  type declarations.
>>
>> Added:
>>     incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java   (with props)
>> Modified:
>>     incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java
>>
>> Added: incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java
>> URL: http://svn.apache.org/viewvc/incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java?rev=1180006&view=auto
>> ==============================================================================
>> --- incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java (added)
>> +++ incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java Fri Oct  7 11:20:23 2011
>> @@ -0,0 +1,42 @@
>> +/*
>> + *  Licensed under the Apache License, Version 2.0 (the "License");
>> + *  you may not use this file except in compliance with the License.
>> + *  You may obtain a copy of the License at
>> + *
>> + *      http://www.apache.org/licenses/LICENSE-2.0
>> + *
>> + *  Unless required by applicable law or agreed to in writing, software
>> + *  distributed under the License is distributed on an "AS IS" BASIS,
>> + *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>> + *  See the License for the specific language governing permissions and
>> + * limitations under the License.
>> + */
>> +package org.apache.wookie.util.html;
>> +
>> +import org.htmlcleaner.DoctypeToken;
>> +
>> +/**
>> + *
>> + * An extended HTML Cleaner DocTypeToken class to deal with HTML5 declarations better then the default, which displays emtpy strings and nulls.
>> + *
>> + * Note:<!DOCTYPE html SYSTEM "about:legacy-compat">  is also a valid HTML5 doctype - but html cleaner only makes the html
>> + * into uppercase, which although is still not correct, doesn't seem to cause problems in wookie at present.
>> + *
>> + * http://sourceforge.net/tracker/?func=detail&aid=3190583&group_id=183053&atid=903696
>> + *
>> + */
>> +public class Html5DoctypeToken extends DoctypeToken {
>> +	
>> +	public static String BADDOCTYPE = "<!DOCTYPE HTML null \"\">";
>> +	public static String GOODDOCTYPE = "<!DOCTYPE html>";
>> +
>> +	public Html5DoctypeToken(String part1, String part2, String part3,
>> +			String part4) {
>> +		super(part1, part2, part3, part4);		
>> +	}
>> +		
>> +	public String getContent(){
>> +		return GOODDOCTYPE;
>> +	}
>> +
>> +}
>>
>> Propchange: incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java
>> ------------------------------------------------------------------------------
>>     svn:mime-type = text/plain
>>
>> Modified: incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java
>> URL: http://svn.apache.org/viewvc/incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java?rev=1180006&r1=1180005&r2=1180006&view=diff
>> ==============================================================================
>> --- incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java (original)
>> +++ incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java Fri Oct  7 11:20:23 2011
>> @@ -20,6 +20,7 @@ import java.util.ArrayList;
>> import java.util.List;
>>
>> import org.htmlcleaner.CleanerProperties;
>> +import org.htmlcleaner.DoctypeToken;
>> import org.htmlcleaner.TagNode;
>>
>> /**
>> @@ -64,11 +65,12 @@ public class HtmlCleaner implements IHtm
>> 	public void setReader(Reader reader) throws IOException{
>> 		if (reader == null) throw new IOException("Reader was null");
>> 		this.reader = reader;
>> -		htmlNode = cleaner.clean(this.reader);			
>> +		htmlNode = cleaner.clean(this.reader);
>> 		headNode = htmlNode.findElementByName(HEAD_TAG, false);	
>> 		// remove widget-specific scripts. These will be replaced
>> 		// after processing, so that the injected scripts come first
>> 		removeUserScripts();
>> +		fixHTML5Doctype();
>> 	}
>> 	
>> 	/* (non-Javadoc)
>> @@ -167,5 +169,17 @@ public class HtmlCleaner implements IHtm
>> 			headNode.addChild(node);
>> 		}
>> 	}
>> +	
>> +	/**
>> +	 *  Fix for a bug in HTMLCleaner which cannot handle HTML5 doctypes correctly
>> +	 *  See http://sourceforge.net/tracker/?func=detail&aid=3190583&group_id=183053&atid=903696
>> +	 */
>> +	private void fixHTML5Doctype(){
>> +		DoctypeToken docType = htmlNode.getDocType();
>> +		if(docType.getContent().equalsIgnoreCase(Html5DoctypeToken.BADDOCTYPE)){
>> +			Html5DoctypeToken newToken = new Html5DoctypeToken("html",null,null,null);
>> +			htmlNode.setDocType(newToken);
>> +		}
>> +	}
>>
>> }
>>
>>
>


Re: svn commit: r1180006 - in /incubator/wookie/trunk/src/org/apache/wookie/util/html: Html5DoctypeToken.java HtmlCleaner.java

Posted by Scott Wilson <sc...@gmail.com>.
There is an issue in the HtmlCleaner tracker for this:

http://sourceforge.net/tracker/index.php?func=detail&aid=3190583&group_id=183053&atid=903696

On 7 Oct 2011, at 12:20, psharples@apache.org wrote:

> Author: psharples
> Date: Fri Oct  7 11:20:23 2011
> New Revision: 1180006
> 
> URL: http://svn.apache.org/viewvc?rev=1180006&view=rev
> Log:
> Fix for HTMLCleaners bad rewriting of HTML5 doctypes. (it originally added a null and an empty string to <!DOCTYPE html> type declarations.
> 
> Added:
>    incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java   (with props)
> Modified:
>    incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java
> 
> Added: incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java
> URL: http://svn.apache.org/viewvc/incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java?rev=1180006&view=auto
> ==============================================================================
> --- incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java (added)
> +++ incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java Fri Oct  7 11:20:23 2011
> @@ -0,0 +1,42 @@
> +/*
> + *  Licensed under the Apache License, Version 2.0 (the "License");
> + *  you may not use this file except in compliance with the License.
> + *  You may obtain a copy of the License at
> + *
> + *      http://www.apache.org/licenses/LICENSE-2.0
> + *
> + *  Unless required by applicable law or agreed to in writing, software
> + *  distributed under the License is distributed on an "AS IS" BASIS,
> + *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + *  See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +package org.apache.wookie.util.html;
> +
> +import org.htmlcleaner.DoctypeToken;
> +
> +/**
> + * 
> + * An extended HTML Cleaner DocTypeToken class to deal with HTML5 declarations better then the default, which displays emtpy strings and nulls.
> + * 
> + * Note: <!DOCTYPE html SYSTEM "about:legacy-compat"> is also a valid HTML5 doctype - but html cleaner only makes the html
> + * into uppercase, which although is still not correct, doesn't seem to cause problems in wookie at present.
> + * 
> + * http://sourceforge.net/tracker/?func=detail&aid=3190583&group_id=183053&atid=903696
> + * 
> + */
> +public class Html5DoctypeToken extends DoctypeToken {
> +	
> +	public static String BADDOCTYPE = "<!DOCTYPE HTML null \"\">";
> +	public static String GOODDOCTYPE = "<!DOCTYPE html>";
> +
> +	public Html5DoctypeToken(String part1, String part2, String part3,
> +			String part4) {
> +		super(part1, part2, part3, part4);		
> +	}
> +		
> +	public String getContent(){
> +		return GOODDOCTYPE;
> +	}
> +
> +}
> 
> Propchange: incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java
> ------------------------------------------------------------------------------
>    svn:mime-type = text/plain
> 
> Modified: incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java
> URL: http://svn.apache.org/viewvc/incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java?rev=1180006&r1=1180005&r2=1180006&view=diff
> ==============================================================================
> --- incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java (original)
> +++ incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java Fri Oct  7 11:20:23 2011
> @@ -20,6 +20,7 @@ import java.util.ArrayList;
> import java.util.List;
> 
> import org.htmlcleaner.CleanerProperties;
> +import org.htmlcleaner.DoctypeToken;
> import org.htmlcleaner.TagNode;
> 
> /**
> @@ -64,11 +65,12 @@ public class HtmlCleaner implements IHtm
> 	public void setReader(Reader reader) throws IOException{
> 		if (reader == null) throw new IOException("Reader was null");
> 		this.reader = reader;
> -		htmlNode = cleaner.clean(this.reader);			
> +		htmlNode = cleaner.clean(this.reader);
> 		headNode = htmlNode.findElementByName(HEAD_TAG, false);	
> 		// remove widget-specific scripts. These will be replaced
> 		// after processing, so that the injected scripts come first
> 		removeUserScripts();
> +		fixHTML5Doctype();
> 	}
> 	
> 	/* (non-Javadoc)
> @@ -167,5 +169,17 @@ public class HtmlCleaner implements IHtm
> 			headNode.addChild(node);
> 		}
> 	}
> +	
> +	/**
> +	 *  Fix for a bug in HTMLCleaner which cannot handle HTML5 doctypes correctly
> +	 *  See http://sourceforge.net/tracker/?func=detail&aid=3190583&group_id=183053&atid=903696
> +	 */
> +	private void fixHTML5Doctype(){
> +		DoctypeToken docType = htmlNode.getDocType();
> +		if(docType.getContent().equalsIgnoreCase(Html5DoctypeToken.BADDOCTYPE)){
> +			Html5DoctypeToken newToken = new Html5DoctypeToken("html",null,null,null);
> +			htmlNode.setDocType(newToken);
> +		}
> +	}
> 
> }
> 
>