You are viewing a plain text version of this content. The canonical link for it is here.
Posted to cvs@cocoon.apache.org by jb...@apache.org on 2006/03/25 10:13:47 UTC

svn commit: r388734 - in /cocoon/branches/BRANCH_2_1_X: ./ src/blocks/html/WEB-INF/ src/blocks/html/conf/ src/blocks/html/java/org/apache/cocoon/generation/ src/blocks/html/java/org/apache/cocoon/transformation/ src/blocks/html/samples/ src/blocks/html...

Author: jbq
Date: Sat Mar 25 01:13:44 2006
New Revision: 388734

URL: http://svn.apache.org/viewcvs?rev=388734&view=rev
Log:
COCOON-1639: NekoHTMLTransformer

Also added stylesheet (apache-no-namespace.xsl) for sample of NekoHTMLGenerator

Added:
    cocoon/branches/BRANCH_2_1_X/src/blocks/html/WEB-INF/neko.properties   (with props)
    cocoon/branches/BRANCH_2_1_X/src/blocks/html/java/org/apache/cocoon/transformation/NekoHTMLTransformer.java   (with props)
    cocoon/branches/BRANCH_2_1_X/src/blocks/html/samples/stylesheets/apache-no-namespace.xsl   (with props)
Modified:
    cocoon/branches/BRANCH_2_1_X/src/blocks/html/conf/html-transformer.xmap
    cocoon/branches/BRANCH_2_1_X/src/blocks/html/conf/html.xmap
    cocoon/branches/BRANCH_2_1_X/src/blocks/html/java/org/apache/cocoon/generation/NekoHTMLGenerator.java
    cocoon/branches/BRANCH_2_1_X/src/blocks/html/samples/samples.xml
    cocoon/branches/BRANCH_2_1_X/src/blocks/html/samples/sitemap.xmap
    cocoon/branches/BRANCH_2_1_X/status.xml

Added: cocoon/branches/BRANCH_2_1_X/src/blocks/html/WEB-INF/neko.properties
URL: http://svn.apache.org/viewcvs/cocoon/branches/BRANCH_2_1_X/src/blocks/html/WEB-INF/neko.properties?rev=388734&view=auto
==============================================================================
--- cocoon/branches/BRANCH_2_1_X/src/blocks/html/WEB-INF/neko.properties (added)
+++ cocoon/branches/BRANCH_2_1_X/src/blocks/html/WEB-INF/neko.properties Sat Mar 25 01:13:44 2006
@@ -0,0 +1,33 @@
+# Properties file used by NekoHTMLGenerator, NekoHTMLTransformer.
+# List compiled based on NekoHTML 0.9.5, see also:
+#     http://people.apache.org/~andyc/neko/doc/html/settings.html
+
+# values below are commented out as they are the defaults anyway
+
+#http\://xml.org/sax/features/namespaces=true
+#http\://cyberneko.org/html/features/balance-tags=true
+#http\://cyberneko.org/html/features/override-doctype=false
+#http\://cyberneko.org/html/features/insert-doctype=false
+#http\://cyberneko.org/html/features/override-namespaces=false
+#http\://cyberneko.org/html/features/insert-namespaces=false
+#http\://cyberneko.org/html/features/balance-tags/ignore-outside-content=false
+#http\://cyberneko.org/html/features/balance-tags/document-fragment=false
+#http\://cyberneko.org/html/features/scanner/cdata-sections=false
+#http\://apache.org/xml/features/scanner/notify-char-refs=false
+#http\://apache.org/xml/features/scanner/notify-builtin-refs=false
+#http\://cyberneko.org/html/features/scanner/notify-builtin-refs=false
+#http\://cyberneko.org/html/features/scanner/fix-mswindows-refs=false
+#http\://cyberneko.org/html/features/scanner/ignore-specified-charset=false
+#http\://cyberneko.org/html/features/scanner/script/strip-comment-delims=false
+#http\://cyberneko.org/html/features/scanner/script/strip-cdata-delims=false
+#http\://cyberneko.org/html/features/scanner/style/strip-comment-delims=false
+#http\://cyberneko.org/html/features/scanner/style/strip-cdata-delims=false
+#http\://cyberneko.org/html/features/augmentations=false
+#http\://cyberneko.org/html/features/report-errors=false
+#http\://cyberneko.org/html/properties/default-encoding=Windows-1252
+# NB Neko default for names/elems is "upper", but generator/transformer override this
+#http\://cyberneko.org/html/properties/names/elems=lower
+#http\://cyberneko.org/html/properties/names/attrs=lower
+#http\://cyberneko.org/html/properties/doctype/pubid=-//W3C//DTD HTML 4.01 Transitional//EN
+#http\://cyberneko.org/html/properties/doctype/sysid=http://www.w3.org/TR/html4/loose.dtd
+#http\://cyberneko.org/html/properties/namespaces-uri=http://www.w3.org/1999/xhtml

Propchange: cocoon/branches/BRANCH_2_1_X/src/blocks/html/WEB-INF/neko.properties
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: cocoon/branches/BRANCH_2_1_X/src/blocks/html/conf/html-transformer.xmap
URL: http://svn.apache.org/viewcvs/cocoon/branches/BRANCH_2_1_X/src/blocks/html/conf/html-transformer.xmap?rev=388734&r1=388733&r2=388734&view=diff
==============================================================================
--- cocoon/branches/BRANCH_2_1_X/src/blocks/html/conf/html-transformer.xmap (original)
+++ cocoon/branches/BRANCH_2_1_X/src/blocks/html/conf/html-transformer.xmap Sat Mar 25 01:13:44 2006
@@ -22,8 +22,19 @@
     <map:transformer
       name="html"
       logger="sitemap.transformer.html"
-      src="org.apache.cocoon.transformation.HTMLTransformer"
-    />
+      src="org.apache.cocoon.transformation.HTMLTransformer">
+      <!-- Tidy configuration file.
+      <jtidy-config>context://WEB-INF/tidy.properties</jtidy-config>
+      -->
+    </map:transformer>
+    <map:transformer
+      name="nekohtml"
+      logger="sitemap.transformer.html"
+      src="org.apache.cocoon.transformation.NekoHTMLTransformer">
+      <!-- NekoHTML configuration file.
+      <neko-config>context://WEB-INF/neko.properties</neko-config>
+      -->
+    </map:transformer>
 
     <map:transformer
         name="htmlcleanup"

Modified: cocoon/branches/BRANCH_2_1_X/src/blocks/html/conf/html.xmap
URL: http://svn.apache.org/viewcvs/cocoon/branches/BRANCH_2_1_X/src/blocks/html/conf/html.xmap?rev=388734&r1=388733&r2=388734&view=diff
==============================================================================
--- cocoon/branches/BRANCH_2_1_X/src/blocks/html/conf/html.xmap (original)
+++ cocoon/branches/BRANCH_2_1_X/src/blocks/html/conf/html.xmap Sat Mar 25 01:13:44 2006
@@ -31,8 +31,8 @@
                    logger="sitemap.generator.html"
                    src="org.apache.cocoon.generation.NekoHTMLGenerator"
                    label="content">
-      <!-- Tidy configuration file.
-      <neko-config>???</neko-config>
+      <!-- NekoHTML configuration file.
+      <neko-config>context://WEB-INF/neko.properties</neko-config>
       -->
     </map:generator>
 </xmap>

Modified: cocoon/branches/BRANCH_2_1_X/src/blocks/html/java/org/apache/cocoon/generation/NekoHTMLGenerator.java
URL: http://svn.apache.org/viewcvs/cocoon/branches/BRANCH_2_1_X/src/blocks/html/java/org/apache/cocoon/generation/NekoHTMLGenerator.java?rev=388734&r1=388733&r2=388734&view=diff
==============================================================================
--- cocoon/branches/BRANCH_2_1_X/src/blocks/html/java/org/apache/cocoon/generation/NekoHTMLGenerator.java (original)
+++ cocoon/branches/BRANCH_2_1_X/src/blocks/html/java/org/apache/cocoon/generation/NekoHTMLGenerator.java Sat Mar 25 01:13:44 2006
@@ -316,7 +316,12 @@
             if (properties != null) {
                 for (Iterator i = properties.keySet().iterator();i.hasNext();) {
                     String name = (String) i.next();
-                    config.setProperty(name, properties.getProperty(name));
+                    if (name.indexOf("/features/") > -1) {
+                        // Parse the configured value; Boolean.getBoolean() would look up a system property instead.
+                        config.setFeature(name, Boolean.valueOf(properties.getProperty(name)).booleanValue());
+                    } else if (name.indexOf("/properties/") > -1) {
+                        config.setProperty(name, properties.getProperty(name));
+                    }
                 }
             }
             return config;

Added: cocoon/branches/BRANCH_2_1_X/src/blocks/html/java/org/apache/cocoon/transformation/NekoHTMLTransformer.java
URL: http://svn.apache.org/viewcvs/cocoon/branches/BRANCH_2_1_X/src/blocks/html/java/org/apache/cocoon/transformation/NekoHTMLTransformer.java?rev=388734&view=auto
==============================================================================
--- cocoon/branches/BRANCH_2_1_X/src/blocks/html/java/org/apache/cocoon/transformation/NekoHTMLTransformer.java (added)
+++ cocoon/branches/BRANCH_2_1_X/src/blocks/html/java/org/apache/cocoon/transformation/NekoHTMLTransformer.java Sat Mar 25 01:13:44 2006
@@ -0,0 +1,213 @@
+/*
+ * Copyright 1999-2004 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cocoon.transformation;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Properties;
+import java.util.StringTokenizer;
+
+import org.apache.avalon.framework.configuration.Configurable;
+import org.apache.avalon.framework.configuration.Configuration;
+import org.apache.avalon.framework.configuration.ConfigurationException;
+import org.apache.avalon.framework.parameters.Parameters;
+import org.apache.cocoon.ProcessingException;
+import org.apache.cocoon.environment.SourceResolver;
+import org.apache.cocoon.xml.dom.DOMBuilder;
+import org.apache.cocoon.xml.IncludeXMLConsumer;
+import org.apache.cocoon.xml.XMLUtils;
+import org.apache.excalibur.source.Source;
+import org.apache.xerces.parsers.AbstractSAXParser;
+import org.cyberneko.html.HTMLConfiguration;
+import org.w3c.dom.Document;
+import org.xml.sax.Attributes;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+/**
+ * Converts (escaped) HTML snippets into tidied HTML using the NekoHTML library.
+ * This transformer expects a list of elements, passed as comma separated
+ * values of the "tags" parameter. It records the text enclosed in such
+ * elements and passes it through Neko to obtain valid XHTML.
+ *
+ * @version $Id$
+ */
+public class NekoHTMLTransformer
+    extends AbstractSAXTransformer
+    implements Configurable {
+
+    /**
+     * Properties for Neko format
+     */
+    private Properties properties;
+    
+    /**
+     * Tags that must be normalized
+     */
+    private Map tags;
+
+    /**
+     * When the closing element is one of the configured tags, runs Neko on
+     * the text recorded since its start tag, then forwards the end event.
+     * @throws SAXException if the recorded text cannot be normalized
+     * @see org.xml.sax.ContentHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
+     */
+    public void endElement(String uri, String name, String raw)
+        throws SAXException {
+        if (this.tags.containsKey(name)) {
+            String toBeNormalized = this.endTextRecording();
+            try {
+                this.normalize(toBeNormalized);
+            } catch (ProcessingException e) {
+                throw new SAXException("Cannot normalize content of <" + raw + ">", e); // do not swallow
+            }
+        }
+        super.endElement(uri, name, raw);
+    }
+
+    /**
+     * Forwards the start event, then begins recording text when the element
+     * is one of the tags to be normalized.
+     *
+     * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
+     */
+    public void startElement(String uri, String name, String raw, Attributes attr)
+        throws SAXException {
+        // The start tag itself is always passed on first.
+        super.startElement(uri, name, raw, attr);
+
+        // Text inside a listed tag is recorded from here on.
+        final boolean mustRecord = this.tags.containsKey(name);
+        if (mustRecord) {
+            this.startTextRecording();
+        }
+    }
+
+    /**
+     * Configure this transformer, loading the optional <code>neko-config</code> properties file.
+     * @throws ConfigurationException if that file cannot be resolved or read
+     */
+    public void configure(Configuration config) throws ConfigurationException {
+        super.configure(config);
+        String configUrl = config.getChild("neko-config").getValue(null);
+        if (configUrl != null) {
+            org.apache.excalibur.source.SourceResolver resolver = null;
+            Source configSource = null;
+            try {
+                resolver = (org.apache.excalibur.source.SourceResolver)
+                           this.manager.lookup(org.apache.excalibur.source.SourceResolver.ROLE);
+                configSource = resolver.resolveURI(configUrl);
+                if (getLogger().isDebugEnabled()) {
+                    getLogger().debug("Loading configuration from " + configSource.getURI());
+                }
+                this.properties = new Properties();
+                BufferedInputStream in = new BufferedInputStream(configSource.getInputStream());
+                try {
+                    this.properties.load(in);
+                } finally {
+                    in.close(); // Properties.load() does not close the stream
+                }
+            } catch (Exception e) {
+                getLogger().warn("Cannot load configuration from " + configUrl);
+                throw new ConfigurationException("Cannot load configuration from " + configUrl, e);
+            } finally {
+                if (null != resolver) {
+                    resolver.release(configSource); // hand the source back while the resolver is still held
+                    this.manager.release(resolver);
+                }
+            }
+        }
+    }
+
+    /**
+     * The beef: run Neko on the buffered text and stream the result
+     * to this transformer's content and lexical handlers.
+     *
+     * @param text the string to be tidied
+     * @throws ProcessingException if parsing or streaming the text fails
+     */
+    private void normalize(String text) throws ProcessingException {
+        try {
+            HtmlSaxParser parser = new HtmlSaxParser(this.properties);
+            DOMBuilder builder = new DOMBuilder();
+            parser.setContentHandler(builder);
+
+            // Hand Neko the characters themselves; round-tripping through
+            // text.getBytes() would depend on the platform default charset.
+            parser.parse(new InputSource(new java.io.StringReader(text)));
+            Document doc = builder.getDocument();
+
+            IncludeXMLConsumer.includeNode(doc, this.contentHandler, this.lexicalHandler);
+        } catch (Exception e) {
+            throw new ProcessingException(
+                "Exception in NekoHTMLTransformer.normalize()",
+                e);
+        }
+    }
+
+    /**
+     * Setup this component: reads the comma separated "tags" parameter and
+     * remembers each trimmed name as a tag whose content must be tidied.
+     */
+    public void setup(SourceResolver resolver, Map objectModel, String src, Parameters par)
+        throws ProcessingException, SAXException, IOException {
+        super.setup(resolver, objectModel, src, par);
+
+        final String tagsParam = par.getParameter("tags", "");
+        if (getLogger().isDebugEnabled()) {
+            getLogger().debug("tags: " + tagsParam);
+        }
+
+        // A fresh map is built on every call.
+        final Map tagNames = new HashMap();
+        final StringTokenizer names = new StringTokenizer(tagsParam, ",");
+        while (names.hasMoreTokens()) {
+            final String tagName = names.nextToken().trim();
+            tagNames.put(tagName, tagName);
+        }
+        this.tags = tagNames;
+    }
+    
+    /** SAX parser running on a NekoHTML configuration built from the given properties. */
+    public static class HtmlSaxParser extends AbstractSAXParser {
+        public HtmlSaxParser(Properties properties) {
+            super(getConfig(properties));
+        }
+        /** Builds the configuration; feature values are parsed as booleans (not looked up as system properties). */
+        private static HTMLConfiguration getConfig(Properties properties) {
+            HTMLConfiguration config = new HTMLConfiguration();
+            config.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
+            if (properties != null) {
+                for (Iterator i = properties.keySet().iterator();i.hasNext();) {
+                    String name = (String) i.next();
+                    if (name.indexOf("/features/") > -1) {
+                        config.setFeature(name, Boolean.valueOf(properties.getProperty(name)).booleanValue());
+                    } else if (name.indexOf("/properties/") > -1) {
+                        config.setProperty(name, properties.getProperty(name));
+                    }
+                }
+            }
+            return config;
+        }
+    }
+    
+}

Propchange: cocoon/branches/BRANCH_2_1_X/src/blocks/html/java/org/apache/cocoon/transformation/NekoHTMLTransformer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: cocoon/branches/BRANCH_2_1_X/src/blocks/html/java/org/apache/cocoon/transformation/NekoHTMLTransformer.java
------------------------------------------------------------------------------
    svn:keywords = Id

Modified: cocoon/branches/BRANCH_2_1_X/src/blocks/html/samples/samples.xml
URL: http://svn.apache.org/viewcvs/cocoon/branches/BRANCH_2_1_X/src/blocks/html/samples/samples.xml?rev=388734&r1=388733&r2=388734&view=diff
==============================================================================
--- cocoon/branches/BRANCH_2_1_X/src/blocks/html/samples/samples.xml (original)
+++ cocoon/branches/BRANCH_2_1_X/src/blocks/html/samples/samples.xml Sat Mar 25 01:13:44 2006
@@ -26,13 +26,15 @@
 
   <group name="HTMLGenerator">
     <sample name="Cocoon News Website" href="apache">
-      Shows how to get remote resource and convert it to valid XHTML using HTMLGenerator.
+      Shows how to get remote resource and convert it to valid XHTML using HTMLGenerator.  Tidy produces content in the
+      XHTML namespace.
     </sample>
   </group>
   
   <group name="NekoHTMLGenerator">
     <sample name="Cocoon News Website" href="apache-neko">
-      Shows how to get remote resource and convert it to valid XML using NekoHTMLGenerator.
+      Shows how to get remote resource and convert it to valid XML using NekoHTMLGenerator.  Neko produces content
+      without namespace.
     </sample>
   </group>
 
@@ -42,6 +44,17 @@
       as strings inside elements (as often found in RSS feeds)
     </sample>
     <sample name="Parsed output" href="HTMLTransformer/parsed.xml">
+      HTMLTransformer applied to input.xml: escaped
+      HTML markup is converted to XHTML
+    </sample>
+  </group>
+
+  <group name="NekoHTMLTransformer">
+    <sample name="XML input" href="NekoHTMLTransformer/input.xml">
+      XML document containing escaped HTML, blocks of HTML code written
+      as strings inside elements (as often found in RSS feeds)
+    </sample>
+    <sample name="Parsed output" href="NekoHTMLTransformer/parsed.xml">
-      HTMLTransformer applied to input.xml: escaped
+      NekoHTMLTransformer applied to input.xml: escaped
       HTML markup is converted to XHTML
     </sample>

Modified: cocoon/branches/BRANCH_2_1_X/src/blocks/html/samples/sitemap.xmap
URL: http://svn.apache.org/viewcvs/cocoon/branches/BRANCH_2_1_X/src/blocks/html/samples/sitemap.xmap?rev=388734&r1=388733&r2=388734&view=diff
==============================================================================
--- cocoon/branches/BRANCH_2_1_X/src/blocks/html/samples/sitemap.xmap (original)
+++ cocoon/branches/BRANCH_2_1_X/src/blocks/html/samples/sitemap.xmap Sat Mar 25 01:13:44 2006
@@ -71,7 +71,8 @@
 
       <map:match pattern="apache-neko">
         <map:generate type="nekohtml" src="http://cocoon.apache.org/news"/>
-        <map:serialize type="xml"/>
+        <map:transform src="stylesheets/apache-no-namespace.xsl"/>
+        <map:serialize type="xhtml"/>
       </map:match>
 
       <!-- ================  HTMLTransformer ================= -->
@@ -90,6 +91,21 @@
         <map:serialize type="xhtml"/>
       </map:match>
 
+      <!-- ================  NekoHTMLTransformer ================= -->
+
+      <map:match pattern="NekoHTMLTransformer/input.xml">
+        <map:generate src="htmltransformer/input.xml"/>
+        <map:serialize type="xml"/>
+      </map:match>
+
+      <map:match pattern="NekoHTMLTransformer/parsed.xml">
+        <map:generate src="htmltransformer/input.xml"/>
+        <map:transform type="nekohtml">
+          <map:parameter name="tags" value="description,escaped-html"/>
+        </map:transform>
+        <map:transform src="htmltransformer/post-transformer-filter.xsl"/>
+        <map:serialize type="xhtml"/>
+      </map:match>
     </map:pipeline>
   </map:pipelines>
 </map:sitemap>

Added: cocoon/branches/BRANCH_2_1_X/src/blocks/html/samples/stylesheets/apache-no-namespace.xsl
URL: http://svn.apache.org/viewcvs/cocoon/branches/BRANCH_2_1_X/src/blocks/html/samples/stylesheets/apache-no-namespace.xsl?rev=388734&view=auto
==============================================================================
--- cocoon/branches/BRANCH_2_1_X/src/blocks/html/samples/stylesheets/apache-no-namespace.xsl (added)
+++ cocoon/branches/BRANCH_2_1_X/src/blocks/html/samples/stylesheets/apache-no-namespace.xsl Sat Mar 25 01:13:44 2006
@@ -0,0 +1,49 @@
+<?xml version="1.0"?>
+<!--
+  Copyright 1999-2004 The Apache Software Foundation
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+
+<xsl:template match="/html">
+  <html>
+    <head>
+      <title><xsl:value-of select="head/title"/></title>
+    </head>
+    <body>
+      <h2><xsl:value-of select="head/title"/></h2>
+      <!-- Each selected list is written as its own <ul> by the "ul" template, -->
+      <!-- so no wrapper is emitted here: XHTML does not allow <ul> directly inside <ul>. -->
+      <xsl:apply-templates select="//div[@class='content']/ul"/>
+    </body>
+  </html>
+</xsl:template>
+
+<xsl:template match="ul">
+    <ul>
+        <xsl:apply-templates select="li"/>
+    </ul>
+</xsl:template>
+
+<xsl:template match="li">
+    <li><xsl:apply-templates/></li>
+</xsl:template>
+
+<xsl:template match="a">
+    <a href="http://cocoon.apache.org/news/{@href}" title="{@title}">
+      <xsl:value-of select="text()"/>
+    </a>
+</xsl:template>
+
+</xsl:stylesheet>

Propchange: cocoon/branches/BRANCH_2_1_X/src/blocks/html/samples/stylesheets/apache-no-namespace.xsl
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: cocoon/branches/BRANCH_2_1_X/status.xml
URL: http://svn.apache.org/viewcvs/cocoon/branches/BRANCH_2_1_X/status.xml?rev=388734&r1=388733&r2=388734&view=diff
==============================================================================
--- cocoon/branches/BRANCH_2_1_X/status.xml (original)
+++ cocoon/branches/BRANCH_2_1_X/status.xml Sat Mar 25 01:13:44 2006
@@ -180,6 +180,10 @@
   <release version="@version@" date="@date@">
 -->
   <release version="2.1.9" date="TBD">
+    <action dev="JBQ" type="add" fixes-bug="COCOON-1639" due-to="Andrew Stevens" due-to-email="ats37@hotmail.com">
+      Added the NekoHTMLTransformer.  Updated the NekoHTMLGenerator's setup bits to allow for setting parser features as
+      well as properties, and provided a sample neko.properties configuration file.
+    </action>
     <action dev="AG" type="update">
       Updated asm to 2.2.1, asm-util to 2.2.1 groovy to 1.0-jsr-05 and antlr to 2.7.6.
     </action>