You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@sling.apache.org by ro...@apache.org on 2017/10/18 23:22:52 UTC

[sling-org-apache-sling-commons-html] 06/36: SLING-1203 : Use tagsoup html parser instead of nekohtml

This is an automated email from the ASF dual-hosted git repository.

rombert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sling-org-apache-sling-commons-html.git

commit 41222cd82995a41cccbe698db623fed00bd7c139
Author: Carsten Ziegeler <cz...@apache.org>
AuthorDate: Wed Nov 25 12:28:19 2009 +0000

    SLING-1203 : Use tagsoup html parser instead of nekohtml
    
    git-svn-id: https://svn.apache.org/repos/asf/sling/trunk@884067 13f79535-47bb-0310-9956-ffa450edef68
---
 NOTICE                                             |   2 +
 pom.xml                                            |  35 +----
 .../apache/sling/commons/html/impl/DOMBuilder.java | 166 +++++++++++++++++++++
 .../sling/commons/html/impl/HtmlParserImpl.java    |  36 ++++-
 .../sling/commons/html/impl/NekohtmlDomParser.java |  62 --------
 .../sling/commons/html/impl/NekohtmlSaxParser.java |  73 ---------
 6 files changed, 206 insertions(+), 168 deletions(-)

diff --git a/NOTICE b/NOTICE
index 9e87228..be0c7d1 100644
--- a/NOTICE
+++ b/NOTICE
@@ -7,3 +7,5 @@ by Day Software (http://www.day.com/).
 This product includes software developed at
 The Apache Software Foundation (http://www.apache.org/).
 
+This product includes software developed at
+http://home.ccil.org/~cowan/XML/tagsoup/
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index 1e9bcad..4f6b247 100644
--- a/pom.xml
+++ b/pom.xml
@@ -54,19 +54,13 @@
                 <extensions>true</extensions>
                 <configuration>
                     <instructions>
-                        <Import-Package>
-                            !sun.io,*
-                        </Import-Package>
                         <Export-Package>
                             org.apache.sling.commons.html
                         </Export-Package>
                         <Private-Package>
-                            !org.cyberneko.dtd.ant,
                             org.apache.sling.commons.html.impl,
-                            org.cyberneko.*,
-                            org.apache.xerces.parsers.*,org.apache.xerces.xni.parser.*,
-                            org.apache.html.dom, org.apache.wml, org.apache.xerces.*,
-                            org.apache.wml.dom, org.apache.xml.serialize, org.apache.xml.resolver.*
+                            org.ccil.cowan.tagsoup,
+                            org.ccil.cowan.tagsoup.jaxp
                         </Private-Package>
                     </instructions>
                 </configuration>
@@ -89,29 +83,8 @@
 
     <dependencies>
         <dependency>
-            <groupId>xml-apis</groupId>
-            <artifactId>xml-apis</artifactId>
-            <version>1.3.04</version> 
-            <scope>provided</scope>
-        </dependency>
-        <dependency>
-            <groupId>nekohtml</groupId>
-            <artifactId>nekohtml</artifactId>
-            <version>0.9.5</version>
-        </dependency>
-        <dependency>
-            <groupId>nekohtml</groupId>
-            <artifactId>nekodtd</artifactId>
-            <version>0.1.11</version>    
-        </dependency>
-        <dependency>
-            <groupId>xerces</groupId>
-            <artifactId>xercesImpl</artifactId>
-            <version>2.8.1</version>
-        </dependency>
-        <dependency>
-            <groupId>xml-resolver</groupId>
-            <artifactId>xml-resolver</artifactId>
+            <groupId>org.ccil.cowan.tagsoup</groupId>
+            <artifactId>tagsoup</artifactId>
             <version>1.2</version>
         </dependency>
     </dependencies>
diff --git a/src/main/java/org/apache/sling/commons/html/impl/DOMBuilder.java b/src/main/java/org/apache/sling/commons/html/impl/DOMBuilder.java
new file mode 100644
index 0000000..375ae9d
--- /dev/null
+++ b/src/main/java/org/apache/sling/commons/html/impl/DOMBuilder.java
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.sling.commons.html.impl;
+
+import java.io.IOException;
+
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMResult;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+import org.xml.sax.ext.LexicalHandler;
+
+/**
+ * The <code>DOMBuilder</code> is a utility class that will generate a W3C
+ * DOM Document from SAX events.
+ *
+ */
+public class DOMBuilder implements ContentHandler, LexicalHandler {
+
+    /** The default transformer factory shared by all instances */
+    private static final SAXTransformerFactory FACTORY = (SAXTransformerFactory) TransformerFactory.newInstance();
+
+    /** The DOM result that the transformer handler writes the built nodes into */
+    private final DOMResult result;
+
+    private final ContentHandler contentHandler;
+    private final LexicalHandler lexicalHandler;
+
+    /**
+     * Construct a new instance of this DOMBuilder.
+     */
+    public DOMBuilder() throws IOException {
+        try {
+            final TransformerHandler handler = FACTORY.newTransformerHandler();
+            this.contentHandler = handler;
+            this.lexicalHandler = handler;
+            this.result = new DOMResult();
+            handler.setResult(this.result);
+        } catch (javax.xml.transform.TransformerException local) {
+            throw (IOException) new IOException("Fatal-Error: Unable to get transformer handler").initCause(local);
+        }
+    }
+
+    /**
+     * Return the newly built Document, or <code>null</code> if the result holds no node.
+     */
+    public Document getDocument() {
+        if (this.result.getNode() == null) {
+            return null;
+        } else if (this.result.getNode().getNodeType() == Node.DOCUMENT_NODE) {
+            return (Document) this.result.getNode();
+        } else {
+            return this.result.getNode().getOwnerDocument();
+        }
+    }
+
+    public void setDocumentLocator(Locator locator) {
+        contentHandler.setDocumentLocator(locator);
+    }
+
+    public void startDocument()
+    throws SAXException {
+        contentHandler.startDocument();
+    }
+
+    public void endDocument()
+    throws SAXException {
+        contentHandler.endDocument();
+    }
+
+    public void startPrefixMapping(String prefix, String uri)
+    throws SAXException {
+        contentHandler.startPrefixMapping(prefix, uri);
+    }
+
+    public void endPrefixMapping(String prefix)
+    throws SAXException {
+        contentHandler.endPrefixMapping(prefix);
+    }
+
+    public void startElement(String uri, String loc, String raw, Attributes a)
+    throws SAXException {
+        contentHandler.startElement(uri, loc, raw, a);
+    }
+
+    public void endElement(String uri, String loc, String raw)
+    throws SAXException {
+        contentHandler.endElement(uri, loc, raw);
+    }
+
+    public void characters(char c[], int start, int len)
+    throws SAXException {
+        contentHandler.characters(c, start, len);
+    }
+
+    public void ignorableWhitespace(char c[], int start, int len)
+    throws SAXException {
+        contentHandler.ignorableWhitespace(c, start, len);
+    }
+
+    public void processingInstruction(String target, String data)
+    throws SAXException {
+        contentHandler.processingInstruction(target, data);
+    }
+
+    public void skippedEntity(String name)
+    throws SAXException {
+        contentHandler.skippedEntity(name);
+    }
+
+    public void startDTD(String name, String publicId, String systemId)
+    throws SAXException {
+        lexicalHandler.startDTD(name, publicId, systemId);
+    }
+
+    public void endDTD()
+    throws SAXException {
+        lexicalHandler.endDTD();
+    }
+
+    public void startEntity(String name)
+    throws SAXException {
+        lexicalHandler.startEntity(name);
+    }
+
+    public void endEntity(String name)
+    throws SAXException {
+        lexicalHandler.endEntity(name);
+    }
+
+    public void startCDATA()
+    throws SAXException {
+        lexicalHandler.startCDATA();
+    }
+
+    public void endCDATA()
+    throws SAXException {
+        lexicalHandler.endCDATA();
+    }
+
+    public void comment(char ch[], int start, int len)
+    throws SAXException {
+        lexicalHandler.comment(ch, start, len);
+    }
+}
diff --git a/src/main/java/org/apache/sling/commons/html/impl/HtmlParserImpl.java b/src/main/java/org/apache/sling/commons/html/impl/HtmlParserImpl.java
index cda543e..ed9a28d 100644
--- a/src/main/java/org/apache/sling/commons/html/impl/HtmlParserImpl.java
+++ b/src/main/java/org/apache/sling/commons/html/impl/HtmlParserImpl.java
@@ -22,9 +22,12 @@ import java.io.IOException;
 import java.io.InputStream;
 
 import org.apache.sling.commons.html.HtmlParser;
+import org.ccil.cowan.tagsoup.Parser;
 import org.w3c.dom.Document;
 import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
+import org.xml.sax.ext.LexicalHandler;
 
 /**
  * @scr.component metatype="false"
@@ -38,13 +41,42 @@ public class HtmlParserImpl implements HtmlParser {
      */
     public void parse(InputStream stream, String encoding, ContentHandler ch)
     throws SAXException {
-        NekohtmlSaxParser.parse(stream, encoding, ch);
+        final Parser parser = new Parser();
+        if ( ch instanceof LexicalHandler ) {
+            parser.setProperty("http://xml.org/sax/properties/lexical-handler", ch);
+        }
+        parser.setContentHandler(ch);
+        final InputSource source = new InputSource(stream);
+        source.setEncoding(encoding);
+        try {
+            parser.parse(source);
+        } catch (IOException ioe) {
+            throw new SAXException(ioe);
+        }
     }
 
     /**
      * @see org.apache.sling.commons.html.HtmlParser#parse(java.lang.String, java.io.InputStream, java.lang.String)
      */
     public Document parse(String systemId, InputStream stream, String encoding) throws IOException {
-        return NekohtmlDomParser.parse(systemId, stream, encoding);
+        final Parser parser = new Parser();
+
+        final DOMBuilder builder = new DOMBuilder();
+
+        final InputSource source = new InputSource(stream);
+        source.setEncoding(encoding);
+        source.setSystemId(systemId);
+
+        try {
+            parser.setProperty("http://xml.org/sax/properties/lexical-handler", builder);
+            parser.setContentHandler(builder);
+            parser.parse(source);
+        } catch (SAXException se) {
+            if ( se.getCause() instanceof IOException ) {
+                throw (IOException) se.getCause();
+            }
+            throw (IOException) new IOException("Unable to parse xml.").initCause(se);
+        }
+        return builder.getDocument();
     }
 }
diff --git a/src/main/java/org/apache/sling/commons/html/impl/NekohtmlDomParser.java b/src/main/java/org/apache/sling/commons/html/impl/NekohtmlDomParser.java
deleted file mode 100644
index 4bbe349..0000000
--- a/src/main/java/org/apache/sling/commons/html/impl/NekohtmlDomParser.java
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.sling.commons.html.impl;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Iterator;
-import java.util.Properties;
-
-import org.apache.xerces.parsers.AbstractDOMParser;
-import org.apache.xerces.xni.parser.XMLInputSource;
-import org.cyberneko.html.HTMLConfiguration;
-import org.w3c.dom.Document;
-
-/**
- * DOM Parser based on the neko html parser.
- */
-public class NekohtmlDomParser extends AbstractDOMParser {
-
-    public NekohtmlDomParser(Properties properties) {
-        super(getConfig(properties));
-    }
-
-    protected static HTMLConfiguration getConfig(Properties properties) {
-        final HTMLConfiguration config = new HTMLConfiguration();
-        config.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
-        if (properties != null) {
-            for (Iterator<Object> i = properties.keySet().iterator(); i.hasNext();) {
-                final String name = i.next().toString();
-                config.setProperty(name, properties.getProperty(name));
-            }
-        }
-        return config;
-    }
-
-    /**
-     * Parse html.
-     */
-    public static Document parse(String systemId, InputStream stream, String encoding)
-    throws IOException {
-        final NekohtmlDomParser parser = new NekohtmlDomParser(null);
-        XMLInputSource source = new XMLInputSource(null, systemId, null, stream, encoding);
-        parser.parse(source);
-        return parser.getDocument();
-    }
-}
diff --git a/src/main/java/org/apache/sling/commons/html/impl/NekohtmlSaxParser.java b/src/main/java/org/apache/sling/commons/html/impl/NekohtmlSaxParser.java
deleted file mode 100644
index 5eba383..0000000
--- a/src/main/java/org/apache/sling/commons/html/impl/NekohtmlSaxParser.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.sling.commons.html.impl;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Iterator;
-import java.util.Properties;
-
-import org.apache.xerces.parsers.AbstractSAXParser;
-import org.cyberneko.html.HTMLConfiguration;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.InputSource;
-import org.xml.sax.SAXException;
-import org.xml.sax.ext.LexicalHandler;
-
-/**
- * SAX Parser based on the neko html parser.
- */
-public class NekohtmlSaxParser extends AbstractSAXParser {
-
-    public NekohtmlSaxParser(Properties properties) {
-        super(getConfig(properties));
-    }
-
-    protected static HTMLConfiguration getConfig(Properties properties) {
-        final HTMLConfiguration config = new HTMLConfiguration();
-        config.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
-        if (properties != null) {
-            for (Iterator<Object> i = properties.keySet().iterator(); i.hasNext();) {
-                final String name = i.next().toString();
-                config.setProperty(name, properties.getProperty(name));
-            }
-        }
-        return config;
-    }
-
-    /**
-     * Parse html.
-     */
-    public static void parse(InputStream stream, String encoding, ContentHandler ch) throws SAXException {
-        final NekohtmlSaxParser parser = new NekohtmlSaxParser(null);
-        parser.setContentHandler(ch);
-        if (ch instanceof LexicalHandler) {
-            parser.setLexicalHandler((LexicalHandler) ch);
-        }
-        final InputSource is = new InputSource(stream);
-        if ( encoding != null ) {
-            is.setEncoding(encoding);
-        }
-        try {
-            parser.parse(is);
-        } catch (IOException ioe) {
-            throw new SAXException("Error during parsing of html markup.", ioe);
-        }
-    }
-}

-- 
To stop receiving notification emails like this one, please contact
"commits@sling.apache.org" <co...@sling.apache.org>.