You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@wookie.apache.org by ps...@apache.org on 2011/10/07 13:20:23 UTC
svn commit: r1180006 - in
/incubator/wookie/trunk/src/org/apache/wookie/util/html:
Html5DoctypeToken.java HtmlCleaner.java
Author: psharples
Date: Fri Oct 7 11:20:23 2011
New Revision: 1180006
URL: http://svn.apache.org/viewvc?rev=1180006&view=rev
Log:
Fix for HTMLCleaner's bad rewriting of HTML5 doctypes (it originally added a null and an empty string to <!DOCTYPE html> type declarations).
Added:
incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java (with props)
Modified:
incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java
Added: incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java
URL: http://svn.apache.org/viewvc/incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java?rev=1180006&view=auto
==============================================================================
--- incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java (added)
+++ incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java Fri Oct 7 11:20:23 2011
@@ -0,0 +1,42 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.wookie.util.html;
+
+import org.htmlcleaner.DoctypeToken;
+
+/**
+ *
+ * An extended HTML Cleaner DoctypeToken class to deal with HTML5 declarations better than the default, which displays empty strings and nulls.
+ *
+ * Note: <!DOCTYPE html SYSTEM "about:legacy-compat"> is also a valid HTML5 doctype, but HTML Cleaner only converts the "html"
+ * to uppercase, which, although still not correct, doesn't seem to cause problems in Wookie at present.
+ *
+ * http://sourceforge.net/tracker/?func=detail&aid=3190583&group_id=183053&atid=903696
+ *
+ */
+public class Html5DoctypeToken extends DoctypeToken {
+
+ public static String BADDOCTYPE = "<!DOCTYPE HTML null \"\">";
+ public static String GOODDOCTYPE = "<!DOCTYPE html>";
+
+ public Html5DoctypeToken(String part1, String part2, String part3,
+ String part4) {
+ super(part1, part2, part3, part4);
+ }
+
+ public String getContent(){
+ return GOODDOCTYPE;
+ }
+
+}
Propchange: incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified: incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java
URL: http://svn.apache.org/viewvc/incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java?rev=1180006&r1=1180005&r2=1180006&view=diff
==============================================================================
--- incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java (original)
+++ incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java Fri Oct 7 11:20:23 2011
@@ -20,6 +20,7 @@ import java.util.ArrayList;
import java.util.List;
import org.htmlcleaner.CleanerProperties;
+import org.htmlcleaner.DoctypeToken;
import org.htmlcleaner.TagNode;
/**
@@ -64,11 +65,12 @@ public class HtmlCleaner implements IHtm
public void setReader(Reader reader) throws IOException{
if (reader == null) throw new IOException("Reader was null");
this.reader = reader;
- htmlNode = cleaner.clean(this.reader);
+ htmlNode = cleaner.clean(this.reader);
headNode = htmlNode.findElementByName(HEAD_TAG, false);
// remove widget-specific scripts. These will be replaced
// after processing, so that the injected scripts come first
removeUserScripts();
+ fixHTML5Doctype();
}
/* (non-Javadoc)
@@ -167,5 +169,17 @@ public class HtmlCleaner implements IHtm
headNode.addChild(node);
}
}
+
+ /**
+ * Fix for a bug in HTMLCleaner which cannot handle HTML5 doctypes correctly
+ * See http://sourceforge.net/tracker/?func=detail&aid=3190583&group_id=183053&atid=903696
+ */
+ private void fixHTML5Doctype(){
+ DoctypeToken docType = htmlNode.getDocType();
+ if(docType.getContent().equalsIgnoreCase(Html5DoctypeToken.BADDOCTYPE)){
+ Html5DoctypeToken newToken = new Html5DoctypeToken("html",null,null,null);
+ htmlNode.setDocType(newToken);
+ }
+ }
}
Re: svn commit: r1180006 - in /incubator/wookie/trunk/src/org/apache/wookie/util/html:
Html5DoctypeToken.java HtmlCleaner.java
Posted by Paul Sharples <p....@bolton.ac.uk>.
On 07/10/2011 12:40, Scott Wilson wrote:
> There is an issue in the HtmlCleaner tracker for this:
>
> http://sourceforge.net/tracker/index.php?func=detail&aid=3190583&group_id=183053&atid=903696
Yup, I referenced this link in the Wookie fix - see the inline comments in the
last commit. However, I would guess that rather than subclassing as I have
done, they will probably want to fix the core base class so you don't
have to. I could always post something there pointing to our solution
for anybody else looking for a fix.
>
> On 7 Oct 2011, at 12:20, psharples@apache.org wrote:
>
>> Author: psharples
>> Date: Fri Oct 7 11:20:23 2011
>> New Revision: 1180006
>>
>> URL: http://svn.apache.org/viewvc?rev=1180006&view=rev
>> Log:
>> Fix for HTMLCleaner's bad rewriting of HTML5 doctypes (it originally added a null and an empty string to <!DOCTYPE html> type declarations).
>>
>> Added:
>> incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java (with props)
>> Modified:
>> incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java
>>
>> Added: incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java
>> URL: http://svn.apache.org/viewvc/incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java?rev=1180006&view=auto
>> ==============================================================================
>> --- incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java (added)
>> +++ incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java Fri Oct 7 11:20:23 2011
>> @@ -0,0 +1,42 @@
>> +/*
>> + * Licensed under the Apache License, Version 2.0 (the "License");
>> + * you may not use this file except in compliance with the License.
>> + * You may obtain a copy of the License at
>> + *
>> + * http://www.apache.org/licenses/LICENSE-2.0
>> + *
>> + * Unless required by applicable law or agreed to in writing, software
>> + * distributed under the License is distributed on an "AS IS" BASIS,
>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>> + * See the License for the specific language governing permissions and
>> + * limitations under the License.
>> + */
>> +package org.apache.wookie.util.html;
>> +
>> +import org.htmlcleaner.DoctypeToken;
>> +
>> +/**
>> + *
>> + * An extended HTML Cleaner DoctypeToken class to deal with HTML5 declarations better than the default, which displays empty strings and nulls.
>> + *
>> + * Note: <!DOCTYPE html SYSTEM "about:legacy-compat"> is also a valid HTML5 doctype, but HTML Cleaner only converts the "html"
>> + * to uppercase, which, although still not correct, doesn't seem to cause problems in Wookie at present.
>> + *
>> + * http://sourceforge.net/tracker/?func=detail&aid=3190583&group_id=183053&atid=903696
>> + *
>> + */
>> +public class Html5DoctypeToken extends DoctypeToken {
>> +
>> + public static String BADDOCTYPE = "<!DOCTYPE HTML null \"\">";
>> + public static String GOODDOCTYPE = "<!DOCTYPE html>";
>> +
>> + public Html5DoctypeToken(String part1, String part2, String part3,
>> + String part4) {
>> + super(part1, part2, part3, part4);
>> + }
>> +
>> + public String getContent(){
>> + return GOODDOCTYPE;
>> + }
>> +
>> +}
>>
>> Propchange: incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java
>> ------------------------------------------------------------------------------
>> svn:mime-type = text/plain
>>
>> Modified: incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java
>> URL: http://svn.apache.org/viewvc/incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java?rev=1180006&r1=1180005&r2=1180006&view=diff
>> ==============================================================================
>> --- incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java (original)
>> +++ incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java Fri Oct 7 11:20:23 2011
>> @@ -20,6 +20,7 @@ import java.util.ArrayList;
>> import java.util.List;
>>
>> import org.htmlcleaner.CleanerProperties;
>> +import org.htmlcleaner.DoctypeToken;
>> import org.htmlcleaner.TagNode;
>>
>> /**
>> @@ -64,11 +65,12 @@ public class HtmlCleaner implements IHtm
>> public void setReader(Reader reader) throws IOException{
>> if (reader == null) throw new IOException("Reader was null");
>> this.reader = reader;
>> - htmlNode = cleaner.clean(this.reader);
>> + htmlNode = cleaner.clean(this.reader);
>> headNode = htmlNode.findElementByName(HEAD_TAG, false);
>> // remove widget-specific scripts. These will be replaced
>> // after processing, so that the injected scripts come first
>> removeUserScripts();
>> + fixHTML5Doctype();
>> }
>>
>> /* (non-Javadoc)
>> @@ -167,5 +169,17 @@ public class HtmlCleaner implements IHtm
>> headNode.addChild(node);
>> }
>> }
>> +
>> + /**
>> + * Fix for a bug in HTMLCleaner which cannot handle HTML5 doctypes correctly
>> + * See http://sourceforge.net/tracker/?func=detail&aid=3190583&group_id=183053&atid=903696
>> + */
>> + private void fixHTML5Doctype(){
>> + DoctypeToken docType = htmlNode.getDocType();
>> + if(docType.getContent().equalsIgnoreCase(Html5DoctypeToken.BADDOCTYPE)){
>> + Html5DoctypeToken newToken = new Html5DoctypeToken("html",null,null,null);
>> + htmlNode.setDocType(newToken);
>> + }
>> + }
>>
>> }
>>
>>
>
Re: svn commit: r1180006 - in /incubator/wookie/trunk/src/org/apache/wookie/util/html: Html5DoctypeToken.java HtmlCleaner.java
Posted by Scott Wilson <sc...@gmail.com>.
There is an issue in the HtmlCleaner tracker for this:
http://sourceforge.net/tracker/index.php?func=detail&aid=3190583&group_id=183053&atid=903696
On 7 Oct 2011, at 12:20, psharples@apache.org wrote:
> Author: psharples
> Date: Fri Oct 7 11:20:23 2011
> New Revision: 1180006
>
> URL: http://svn.apache.org/viewvc?rev=1180006&view=rev
> Log:
> Fix for HTMLCleaner's bad rewriting of HTML5 doctypes (it originally added a null and an empty string to <!DOCTYPE html> type declarations).
>
> Added:
> incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java (with props)
> Modified:
> incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java
>
> Added: incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java
> URL: http://svn.apache.org/viewvc/incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java?rev=1180006&view=auto
> ==============================================================================
> --- incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java (added)
> +++ incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java Fri Oct 7 11:20:23 2011
> @@ -0,0 +1,42 @@
> +/*
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +package org.apache.wookie.util.html;
> +
> +import org.htmlcleaner.DoctypeToken;
> +
> +/**
> + *
> + * An extended HTML Cleaner DoctypeToken class to deal with HTML5 declarations better than the default, which displays empty strings and nulls.
> + *
> + * Note: <!DOCTYPE html SYSTEM "about:legacy-compat"> is also a valid HTML5 doctype, but HTML Cleaner only converts the "html"
> + * to uppercase, which, although still not correct, doesn't seem to cause problems in Wookie at present.
> + *
> + * http://sourceforge.net/tracker/?func=detail&aid=3190583&group_id=183053&atid=903696
> + *
> + */
> +public class Html5DoctypeToken extends DoctypeToken {
> +
> + public static String BADDOCTYPE = "<!DOCTYPE HTML null \"\">";
> + public static String GOODDOCTYPE = "<!DOCTYPE html>";
> +
> + public Html5DoctypeToken(String part1, String part2, String part3,
> + String part4) {
> + super(part1, part2, part3, part4);
> + }
> +
> + public String getContent(){
> + return GOODDOCTYPE;
> + }
> +
> +}
>
> Propchange: incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java
> ------------------------------------------------------------------------------
> svn:mime-type = text/plain
>
> Modified: incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java
> URL: http://svn.apache.org/viewvc/incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java?rev=1180006&r1=1180005&r2=1180006&view=diff
> ==============================================================================
> --- incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java (original)
> +++ incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java Fri Oct 7 11:20:23 2011
> @@ -20,6 +20,7 @@ import java.util.ArrayList;
> import java.util.List;
>
> import org.htmlcleaner.CleanerProperties;
> +import org.htmlcleaner.DoctypeToken;
> import org.htmlcleaner.TagNode;
>
> /**
> @@ -64,11 +65,12 @@ public class HtmlCleaner implements IHtm
> public void setReader(Reader reader) throws IOException{
> if (reader == null) throw new IOException("Reader was null");
> this.reader = reader;
> - htmlNode = cleaner.clean(this.reader);
> + htmlNode = cleaner.clean(this.reader);
> headNode = htmlNode.findElementByName(HEAD_TAG, false);
> // remove widget-specific scripts. These will be replaced
> // after processing, so that the injected scripts come first
> removeUserScripts();
> + fixHTML5Doctype();
> }
>
> /* (non-Javadoc)
> @@ -167,5 +169,17 @@ public class HtmlCleaner implements IHtm
> headNode.addChild(node);
> }
> }
> +
> + /**
> + * Fix for a bug in HTMLCleaner which cannot handle HTML5 doctypes correctly
> + * See http://sourceforge.net/tracker/?func=detail&aid=3190583&group_id=183053&atid=903696
> + */
> + private void fixHTML5Doctype(){
> + DoctypeToken docType = htmlNode.getDocType();
> + if(docType.getContent().equalsIgnoreCase(Html5DoctypeToken.BADDOCTYPE)){
> + Html5DoctypeToken newToken = new Html5DoctypeToken("html",null,null,null);
> + htmlNode.setDocType(newToken);
> + }
> + }
>
> }
>
>