You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@sling.apache.org by ro...@apache.org on 2017/10/18 23:22:52 UTC
[sling-org-apache-sling-commons-html] 06/36: SLING-1203 : Use
tagsoup html parser instead of nekohtml
This is an automated email from the ASF dual-hosted git repository.
rombert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sling-org-apache-sling-commons-html.git
commit 41222cd82995a41cccbe698db623fed00bd7c139
Author: Carsten Ziegeler <cz...@apache.org>
AuthorDate: Wed Nov 25 12:28:19 2009 +0000
SLING-1203 : Use tagsoup html parser instead of nekohtml
git-svn-id: https://svn.apache.org/repos/asf/sling/trunk@884067 13f79535-47bb-0310-9956-ffa450edef68
---
NOTICE | 2 +
pom.xml | 35 +----
.../apache/sling/commons/html/impl/DOMBuilder.java | 166 +++++++++++++++++++++
.../sling/commons/html/impl/HtmlParserImpl.java | 36 ++++-
.../sling/commons/html/impl/NekohtmlDomParser.java | 62 --------
.../sling/commons/html/impl/NekohtmlSaxParser.java | 73 ---------
6 files changed, 206 insertions(+), 168 deletions(-)
diff --git a/NOTICE b/NOTICE
index 9e87228..be0c7d1 100644
--- a/NOTICE
+++ b/NOTICE
@@ -7,3 +7,5 @@ by Day Software (http://www.day.com/).
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).
+This product includes software developed at
+http://home.ccil.org/~cowan/XML/tagsoup/
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index 1e9bcad..4f6b247 100644
--- a/pom.xml
+++ b/pom.xml
@@ -54,19 +54,13 @@
<extensions>true</extensions>
<configuration>
<instructions>
- <Import-Package>
- !sun.io,*
- </Import-Package>
<Export-Package>
org.apache.sling.commons.html
</Export-Package>
<Private-Package>
- !org.cyberneko.dtd.ant,
org.apache.sling.commons.html.impl,
- org.cyberneko.*,
- org.apache.xerces.parsers.*,org.apache.xerces.xni.parser.*,
- org.apache.html.dom, org.apache.wml, org.apache.xerces.*,
- org.apache.wml.dom, org.apache.xml.serialize, org.apache.xml.resolver.*
+ org.ccil.cowan.tagsoup,
+ org.ccil.cowan.tagsoup.jaxp
</Private-Package>
</instructions>
</configuration>
@@ -89,29 +83,8 @@
<dependencies>
<dependency>
- <groupId>xml-apis</groupId>
- <artifactId>xml-apis</artifactId>
- <version>1.3.04</version>
- <scope>provided</scope>
- </dependency>
- <dependency>
- <groupId>nekohtml</groupId>
- <artifactId>nekohtml</artifactId>
- <version>0.9.5</version>
- </dependency>
- <dependency>
- <groupId>nekohtml</groupId>
- <artifactId>nekodtd</artifactId>
- <version>0.1.11</version>
- </dependency>
- <dependency>
- <groupId>xerces</groupId>
- <artifactId>xercesImpl</artifactId>
- <version>2.8.1</version>
- </dependency>
- <dependency>
- <groupId>xml-resolver</groupId>
- <artifactId>xml-resolver</artifactId>
+ <groupId>org.ccil.cowan.tagsoup</groupId>
+ <artifactId>tagsoup</artifactId>
<version>1.2</version>
</dependency>
</dependencies>
diff --git a/src/main/java/org/apache/sling/commons/html/impl/DOMBuilder.java b/src/main/java/org/apache/sling/commons/html/impl/DOMBuilder.java
new file mode 100644
index 0000000..375ae9d
--- /dev/null
+++ b/src/main/java/org/apache/sling/commons/html/impl/DOMBuilder.java
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.sling.commons.html.impl;
+
+import java.io.IOException;
+
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMResult;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+import org.xml.sax.ext.LexicalHandler;
+
+/**
+ * The <code>DOMBuilder</code> is a utility class that will generate a W3C
+ * DOM Document from SAX events.
+ *
+ * Every {@link ContentHandler} and {@link LexicalHandler} callback received
+ * by this class is forwarded unchanged to a JAXP
+ * <code>TransformerHandler</code> whose output is a {@link DOMResult}; the
+ * tree built that way is available through {@link #getDocument()}.
+ */
+public class DOMBuilder implements ContentHandler, LexicalHandler {
+
+ /**
+ * The default transformer factory shared by all instances.
+ * NOTE(review): the cast throws a ClassCastException during class
+ * initialization if the configured factory is not a SAXTransformerFactory,
+ * and JAXP does not promise that a TransformerFactory is thread-safe while
+ * this one is used by every constructor call without locking - confirm
+ * both are acceptable for the JAXP implementation in use.
+ */
+ private static final SAXTransformerFactory FACTORY = (SAXTransformerFactory) TransformerFactory.newInstance();
+
+ /** The DOM result that the transformer handler writes into */
+ private final DOMResult result;
+
+ private final ContentHandler contentHandler; // assigned in the constructor; same TransformerHandler object as lexicalHandler
+ private final LexicalHandler lexicalHandler; // assigned in the constructor; same TransformerHandler object as contentHandler
+
+ /**
+ * Construct a new instance of this DOMBuilder.
+ *
+ * A new TransformerHandler is obtained from the shared factory and bound
+ * to a new DOMResult.
+ *
+ * @throws IOException if the transformer handler cannot be created; the
+ * underlying TransformerException is attached as the cause
+ */
+ public DOMBuilder() throws IOException {
+ try {
+ final TransformerHandler handler = FACTORY.newTransformerHandler();
+ this.contentHandler = handler;
+ this.lexicalHandler = handler;
+ this.result = new DOMResult();
+ handler.setResult(this.result);
+ } catch (javax.xml.transform.TransformerException local) {
+ throw (IOException) new IOException("Fatal-Error: Unable to get transformer handler").initCause(local);
+ }
+ }
+
+ /**
+ * Return the newly built Document.
+ *
+ * @return <code>null</code> if the result holds no node, the node itself
+ * if it is a document node, otherwise the owner document of the
+ * result node
+ */
+ public Document getDocument() {
+ if (this.result.getNode() == null) {
+ return null;
+ } else if (this.result.getNode().getNodeType() == Node.DOCUMENT_NODE) {
+ return (Document) this.result.getNode();
+ } else {
+ return this.result.getNode().getOwnerDocument();
+ }
+ }
+
+ public void setDocumentLocator(Locator locator) { // from here to skippedEntity: ContentHandler callbacks, each forwarded as-is to contentHandler
+ contentHandler.setDocumentLocator(locator);
+ }
+
+ public void startDocument()
+ throws SAXException {
+ contentHandler.startDocument();
+ }
+
+ public void endDocument()
+ throws SAXException {
+ contentHandler.endDocument();
+ }
+
+ public void startPrefixMapping(String prefix, String uri)
+ throws SAXException {
+ contentHandler.startPrefixMapping(prefix, uri);
+ }
+
+ public void endPrefixMapping(String prefix)
+ throws SAXException {
+ contentHandler.endPrefixMapping(prefix);
+ }
+
+ public void startElement(String uri, String loc, String raw, Attributes a)
+ throws SAXException {
+ contentHandler.startElement(uri, loc, raw, a);
+ }
+
+ public void endElement(String uri, String loc, String raw)
+ throws SAXException {
+ contentHandler.endElement(uri, loc, raw);
+ }
+
+ public void characters(char c[], int start, int len)
+ throws SAXException {
+ contentHandler.characters(c, start, len);
+ }
+
+ public void ignorableWhitespace(char c[], int start, int len)
+ throws SAXException {
+ contentHandler.ignorableWhitespace(c, start, len);
+ }
+
+ public void processingInstruction(String target, String data)
+ throws SAXException {
+ contentHandler.processingInstruction(target, data);
+ }
+
+ public void skippedEntity(String name)
+ throws SAXException {
+ contentHandler.skippedEntity(name);
+ }
+
+ public void startDTD(String name, String publicId, String systemId) // from here to comment: LexicalHandler callbacks, each forwarded as-is to lexicalHandler
+ throws SAXException {
+ lexicalHandler.startDTD(name, publicId, systemId);
+ }
+
+ public void endDTD()
+ throws SAXException {
+ lexicalHandler.endDTD();
+ }
+
+ public void startEntity(String name)
+ throws SAXException {
+ lexicalHandler.startEntity(name);
+ }
+
+ public void endEntity(String name)
+ throws SAXException {
+ lexicalHandler.endEntity(name);
+ }
+
+ public void startCDATA()
+ throws SAXException {
+ lexicalHandler.startCDATA();
+ }
+
+ public void endCDATA()
+ throws SAXException {
+ lexicalHandler.endCDATA();
+ }
+
+ public void comment(char ch[], int start, int len)
+ throws SAXException {
+ lexicalHandler.comment(ch, start, len);
+ }
+}
diff --git a/src/main/java/org/apache/sling/commons/html/impl/HtmlParserImpl.java b/src/main/java/org/apache/sling/commons/html/impl/HtmlParserImpl.java
index cda543e..ed9a28d 100644
--- a/src/main/java/org/apache/sling/commons/html/impl/HtmlParserImpl.java
+++ b/src/main/java/org/apache/sling/commons/html/impl/HtmlParserImpl.java
@@ -22,9 +22,12 @@ import java.io.IOException;
import java.io.InputStream;
import org.apache.sling.commons.html.HtmlParser;
+import org.ccil.cowan.tagsoup.Parser;
import org.w3c.dom.Document;
import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
+import org.xml.sax.ext.LexicalHandler;
/**
* @scr.component metatype="false"
@@ -38,13 +41,42 @@ public class HtmlParserImpl implements HtmlParser {
*/
public void parse(InputStream stream, String encoding, ContentHandler ch)
throws SAXException {
- NekohtmlSaxParser.parse(stream, encoding, ch);
+ final Parser parser = new Parser(); // TagSoup parser; a new instance is created for every call
+ if ( ch instanceof LexicalHandler ) { // register ch for lexical events too when it implements LexicalHandler
+ parser.setProperty("http://xml.org/sax/properties/lexical-handler", ch);
+ }
+ parser.setContentHandler(ch);
+ final InputSource source = new InputSource(stream);
+ source.setEncoding(encoding); // passed through as given; the removed NekohtmlSaxParser skipped this call when encoding was null
+ try {
+ parser.parse(source);
+ } catch (IOException ioe) {
+ throw new SAXException(ioe); // SAXException is the only checked exception declared here; ioe is passed to its constructor
+ }
}
/**
* @see org.apache.sling.commons.html.HtmlParser#parse(java.lang.String, java.io.InputStream, java.lang.String)
*/
public Document parse(String systemId, InputStream stream, String encoding) throws IOException {
- return NekohtmlDomParser.parse(systemId, stream, encoding);
+ final Parser parser = new Parser(); // TagSoup parser; a new instance is created for every call
+
+ final DOMBuilder builder = new DOMBuilder(); // turns the SAX events into a W3C DOM document
+
+ final InputSource source = new InputSource(stream);
+ source.setEncoding(encoding);
+ source.setSystemId(systemId);
+
+ try {
+ parser.setProperty("http://xml.org/sax/properties/lexical-handler", builder); // builder receives lexical events as well as content events
+ parser.setContentHandler(builder);
+ parser.parse(source);
+ } catch (SAXException se) { // this method declares only IOException, so SAX failures are converted below
+ if ( se.getCause() instanceof IOException ) { // rethrow the IOException that the SAXException carries as its cause
+ throw (IOException) se.getCause();
+ }
+ throw (IOException) new IOException("Unable to parse xml.").initCause(se); // initCause attaches the SAXException to the new IOException
+ }
+ return builder.getDocument(); // null when the builder's result holds no node
}
}
diff --git a/src/main/java/org/apache/sling/commons/html/impl/NekohtmlDomParser.java b/src/main/java/org/apache/sling/commons/html/impl/NekohtmlDomParser.java
deleted file mode 100644
index 4bbe349..0000000
--- a/src/main/java/org/apache/sling/commons/html/impl/NekohtmlDomParser.java
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.sling.commons.html.impl;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Iterator;
-import java.util.Properties;
-
-import org.apache.xerces.parsers.AbstractDOMParser;
-import org.apache.xerces.xni.parser.XMLInputSource;
-import org.cyberneko.html.HTMLConfiguration;
-import org.w3c.dom.Document;
-
-/**
- * DOM Parser based on the neko html parser.
- */
-public class NekohtmlDomParser extends AbstractDOMParser {
-
- public NekohtmlDomParser(Properties properties) {
- super(getConfig(properties));
- }
-
- protected static HTMLConfiguration getConfig(Properties properties) {
- final HTMLConfiguration config = new HTMLConfiguration();
- config.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
- if (properties != null) {
- for (Iterator<Object> i = properties.keySet().iterator(); i.hasNext();) {
- final String name = i.next().toString();
- config.setProperty(name, properties.getProperty(name));
- }
- }
- return config;
- }
-
- /**
- * Parse html.
- */
- public static Document parse(String systemId, InputStream stream, String encoding)
- throws IOException {
- final NekohtmlDomParser parser = new NekohtmlDomParser(null);
- XMLInputSource source = new XMLInputSource(null, systemId, null, stream, encoding);
- parser.parse(source);
- return parser.getDocument();
- }
-}
diff --git a/src/main/java/org/apache/sling/commons/html/impl/NekohtmlSaxParser.java b/src/main/java/org/apache/sling/commons/html/impl/NekohtmlSaxParser.java
deleted file mode 100644
index 5eba383..0000000
--- a/src/main/java/org/apache/sling/commons/html/impl/NekohtmlSaxParser.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.sling.commons.html.impl;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Iterator;
-import java.util.Properties;
-
-import org.apache.xerces.parsers.AbstractSAXParser;
-import org.cyberneko.html.HTMLConfiguration;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.InputSource;
-import org.xml.sax.SAXException;
-import org.xml.sax.ext.LexicalHandler;
-
-/**
- * SAX Parser based on the neko html parser.
- */
-public class NekohtmlSaxParser extends AbstractSAXParser {
-
- public NekohtmlSaxParser(Properties properties) {
- super(getConfig(properties));
- }
-
- protected static HTMLConfiguration getConfig(Properties properties) {
- final HTMLConfiguration config = new HTMLConfiguration();
- config.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
- if (properties != null) {
- for (Iterator<Object> i = properties.keySet().iterator(); i.hasNext();) {
- final String name = i.next().toString();
- config.setProperty(name, properties.getProperty(name));
- }
- }
- return config;
- }
-
- /**
- * Parse html.
- */
- public static void parse(InputStream stream, String encoding, ContentHandler ch) throws SAXException {
- final NekohtmlSaxParser parser = new NekohtmlSaxParser(null);
- parser.setContentHandler(ch);
- if (ch instanceof LexicalHandler) {
- parser.setLexicalHandler((LexicalHandler) ch);
- }
- final InputSource is = new InputSource(stream);
- if ( encoding != null ) {
- is.setEncoding(encoding);
- }
- try {
- parser.parse(is);
- } catch (IOException ioe) {
- throw new SAXException("Error during parsing of html markup.", ioe);
- }
- }
-}
--
To stop receiving notification emails like this one, please contact
"commits@sling.apache.org" <co...@sling.apache.org>.