You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@abdera.apache.org by jm...@apache.org on 2007/10/23 18:28:58 UTC
svn commit: r587550 [2/6] - in
/incubator/abdera/java/trunk/extensions/json/src/main: java/nu/
java/nu/validator/ java/nu/validator/htmlparser/
java/nu/validator/htmlparser/common/ java/nu/validator/htmlparser/impl/
java/nu/validator/htmlparser/sax/ ja...
Added: incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/MetaSniffer.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/MetaSniffer.java?rev=587550&view=auto
==============================================================================
--- incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/MetaSniffer.java (added)
+++ incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/MetaSniffer.java Tue Oct 23 09:28:51 2007
@@ -0,0 +1,566 @@
+/*
+ * Copyright (c) 2007 Henri Sivonen
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+package nu.validator.htmlparser.impl;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.UnsupportedCharsetException;
+import java.util.Locale;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+
+import org.xml.sax.ErrorHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXParseException;
+
+public final class MetaSniffer implements Locator {
+
+ /**
+  * Control-flow signal used to unwind out of the sniffing loop. It is
+  * thrown when the byte source is exhausted and when an encoding
+  * declaration has been accepted.
+  *
+  * Declared static because it uses no state of the enclosing sniffer; a
+  * non-static inner class would carry a hidden reference to the outer
+  * instance for no benefit.
+  */
+ private static final class StopSniffingException extends Exception {
+
+     private static final long serialVersionUID = 1L;
+
+ }
+
+ // Matches a complete "content" attribute value of the form
+ // "<anything without ';'>; charset = <value>". White space (tab, LF, VT,
+ // FF, CR, space) is allowed around "charset" and "=", and "charset" is
+ // matched case-insensitively. Exactly one capture group is set:
+ // group 1 = unquoted value (up to the next white space),
+ // group 2 = double-quoted value, group 3 = single-quoted value.
+ private static final Pattern CONTENT = Pattern.compile("^[^;]*;[\\x09\\x0A\\x0B\\x0C\\x0D\\x20]*[cC][hH][aA][rR][sS][eE][tT][\\x09\\x0A\\x0B\\x0C\\x0D\\x20]*=[\\x09\\x0A\\x0B\\x0C\\x0D\\x20]*(?:(?:([^'\"\\x09\\x0A\\x0B\\x0C\\x0D\\x20][^\\x09\\x0A\\x0B\\x0C\\x0D\\x20]*)(?:[\\x09\\x0A\\x0B\\x0C\\x0D\\x20].*)?)|(?:\"([^\"]*)\".*)|(?:'([^']*)'.*))$", Pattern.DOTALL);
+
+ // Progress of matching the tag name against "meta": M, E, T and A mean
+ // that many leading letters have matched so far; NO means the current
+ // tag is not (or no longer) a candidate. Attributes are only buffered
+ // and checked while the state is A.
+ private enum MetaState {
+ NO, M, E, T, A
+ }
+
+ // Where the bytes to be sniffed come from.
+ private final ByteReadable source;
+
+ // Receives errors and warnings; reporting is skipped when this is null.
+ private final ErrorHandler errorHandler;
+
+ // The result of sniffing; stays null until a declaration is accepted.
+ private CharsetDecoder charsetDecoder = null;
+
+ // Lower-cased name of the attribute currently being read (filled only
+ // while metaState is A).
+ private StringBuilder attributeName = new StringBuilder();
+
+ // Raw value of the attribute currently being read (filled only while
+ // metaState is A).
+ private StringBuilder attributeValue = new StringBuilder();
+
+ private MetaState metaState = MetaState.NO;
+
+ // One-byte pushback slot; -1 means the slot is empty.
+ private int unread = -1;
+
+ // Current position, exposed through the Locator methods of this class.
+ private int line = 1;
+
+ private int col = 0;
+
+ // Remembers a preceding CR so that a CR LF pair counts as one line break.
+ private boolean prevWasCR = false;
+
+ // Delegate for the public and system identifiers; may be null.
+ private final Locator locator;
+
+ /**
+  * Creates a sniffer that scans the given byte source for an internal
+  * character encoding declaration.
+  *
+  * @param source the bytes to scan
+  * @param eh the handler that receives errors and warnings raised while
+  *        sniffing; when <code>null</code>, nothing is reported
+  * @param locator the delegate consulted for the public and system
+  *        identifiers; may be <code>null</code>
+  */
+ public MetaSniffer(ByteReadable source, ErrorHandler eh, Locator locator) {
+     this.locator = locator;
+     this.errorHandler = eh;
+     this.source = source;
+ }
+
+ /**
+  * Returns the next byte, preferring a byte that was pushed back with
+  * <code>unread(int)</code>. Bytes taken from the source update the line
+  * and column counters; a pushed-back byte was already counted when it
+  * was first read and is not counted again.
+  *
+  * (The original author notes that returning an int rather than a char
+  * was probably a mistake.)
+  *
+  * @return the byte that was read
+  * @throws IOException if the source fails
+  * @throws StopSniffingException if the source is exhausted
+  */
+ private int read() throws IOException, StopSniffingException {
+     if (unread != -1) {
+         int pushedBack = unread;
+         unread = -1;
+         return pushedBack;
+     }
+     int b = source.readByte();
+     if (b == -1) { // end
+         throw new StopSniffingException();
+     }
+     if (b == 0x0D) { // CR
+         line++;
+         col = 0;
+         prevWasCR = true;
+     } else if (b == 0x0A) { // LF
+         // The LF of a CR LF pair was already counted at the CR.
+         if (!prevWasCR) {
+             line++;
+             col = 0;
+         }
+         prevWasCR = false;
+     } else {
+         col++;
+         prevWasCR = false;
+     }
+     return b;
+ }
+
+ /**
+  * Pushes one byte back so that the next <code>read()</code> returns it.
+  * Only a single byte can be held; a second call overwrites the first.
+  *
+  * @param b the byte to hand back
+  */
+ private void unread(int b) {
+     unread = b;
+ }
+
+ /**
+ * Main loop.
+ *
+ * @return
+ *
+ * @throws SAXException
+ * @throws IOException
+ * @throws
+ */
+ public CharsetDecoder sniff() throws SAXException, IOException {
+ try {
+ for (;;) {
+ if (read() == 0x3C) { // <
+ markup();
+ }
+ }
+ } catch (StopSniffingException e) {
+ return charsetDecoder;
+ }
+ }
+
+ /**
+  * Seen &lt;. Dispatches on the byte that follows the less-than sign:
+  * <code>!</code> starts a markup declaration or comment, <code>/</code>
+  * an end tag, <code>?</code> a construct that is skipped up to the next
+  * greater-than sign, and an ASCII letter a start tag. A leading
+  * <code>m</code> or <code>M</code> arms the "meta" tag-name matcher.
+  * Any other byte is ignored.
+  *
+  * @throws SAXException if the error handler throws
+  * @throws StopSniffingException if sniffing is finished
+  * @throws IOException if the source fails
+  */
+ private void markup() throws SAXException, StopSniffingException, IOException {
+     int b = read();
+     if (b == 0x21) { // !
+         markupDecl();
+     } else if (b == 0x2F) { // /
+         endTag();
+     } else if (b == 0x3F) { // ?
+         consumeUntilAndIncludingGt();
+     } else if (b == 0x4D || b == 0x6D) { // m or M
+         metaState = MetaState.M;
+         tag();
+     } else if ((b >= 0x41 && b <= 0x5A) || (b >= 0x61 && b <= 0x7A)) { // ASCII
+         // letter
+         metaState = MetaState.NO;
+         tag();
+     } else if (b == 0x3C) { // <
+         // The lookahead byte is itself a '<' and may open a construct of
+         // its own (e.g. "<<meta ..."). Hand it back so the main loop
+         // examines it instead of silently dropping it.
+         unread(b);
+     }
+ }
+
+ /**
+  * Seen &lt; and the first letter of a tag name. Consumes the rest of
+  * the tag name while tracking whether it spells "meta"
+  * (case-insensitively), then reads the attributes of the tag.
+  *
+  * The name ends at white space, <code>&gt;</code> or <code>&lt;</code>.
+  * The terminating byte is pushed back; when it is <code>&lt;</code> the
+  * attributes are not read, so the caller sees the new construct.
+  *
+  * @throws SAXException if the error handler throws
+  * @throws StopSniffingException if sniffing is finished
+  * @throws IOException if the source fails
+  */
+ private void tag() throws SAXException, StopSniffingException, IOException {
+     int b;
+     for (;;) {
+         b = read();
+         boolean endOfName = b == 0x09 // tab
+                 || b == 0x0A // LF
+                 || b == 0x0B // VT
+                 || b == 0x0C // FF
+                 || b == 0x0D // CR
+                 || b == 0x20 // space
+                 || b == 0x3E // >
+                 || b == 0x3C; // <
+         if (endOfName) {
+             break;
+         }
+         if (b == 0x45 || b == 0x65) { // E or e
+             metaState = (metaState == MetaState.M) ? MetaState.E : MetaState.NO;
+         } else if (b == 0x54 || b == 0x74) { // T or t
+             metaState = (metaState == MetaState.E) ? MetaState.T : MetaState.NO;
+         } else if (b == 0x41 || b == 0x61) { // A or a
+             metaState = (metaState == MetaState.T) ? MetaState.A : MetaState.NO;
+         } else {
+             metaState = MetaState.NO;
+         }
+     }
+     unread(b);
+     if (b == 0x3C) { // <
+         return;
+     }
+     while (attribute()) {
+         // keep reading attributes until the tag ends
+     }
+ }
+
+ /**
+ * The "get an attribute" subalgorithm. Reads one attribute of the current
+ * tag: skips leading white space and slashes, reads the name, and, if an
+ * equals sign follows, reads the value. Name and value are buffered only
+ * while the tag name matched "meta"; a completed value is passed to
+ * <code>checkAttribute()</code> by the quoted/unquoted value readers.
+ *
+ * @return <code>true</code> if the caller should try to read another
+ * attribute of the same tag; <code>false</code> when to stop, i.e. the
+ * tag ended at a consumed <code>&gt;</code> or at a <code>&lt;</code>
+ * that has been pushed back
+ * @throws SAXException if the error handler throws
+ * @throws StopSniffingException if sniffing is finished
+ * @throws IOException if the source fails
+ */
+ private boolean attribute() throws SAXException, StopSniffingException, IOException {
+ int b;
+ // Step 1: skip white space and slashes before the attribute name.
+ loop: for (;;) {
+ b = read();
+ switch (b) {
+ case 0x09: // tab
+ case 0x0A: // LF
+ case 0x0B: // VT
+ case 0x0C: // FF
+ case 0x0D: // CR
+ case 0x20: // space
+ case 0x2F: // /
+ continue loop;
+ default:
+ break loop;
+ }
+ }
+ // A new '<' ends the tag; push it back so the main loop sees it.
+ if (b == 0x3C) { // <
+ unread(b);
+ return false;
+ }
+ if (b == 0x3E) { // >
+ return false;
+ }
+ // Step 2: read the attribute name. The first name byte is pushed back
+ // so the loop below handles every name byte uniformly.
+ attributeName.setLength(0);
+ attributeValue.setLength(0);
+ unread(b); // this is a bit ugly
+ name: for (;;) {
+ b = read();
+ switch (b) {
+ case 0x3D: // =
+ // not actually advancing here yet
+ break name;
+ case 0x09: // tab
+ case 0x0A: // LF
+ case 0x0B: // VT
+ case 0x0C: // FF
+ case 0x0D: // CR
+ case 0x20: // space
+ // White space ends the name; skip the rest of the white space and
+ // leave the first non-space byte in b. This inner loop only exits
+ // through "break name", so it never falls through to the next case.
+ spaces: for (;;) {
+ b = read();
+ switch (b) {
+ case 0x09: // tab
+ case 0x0A: // LF
+ case 0x0B: // VT
+ case 0x0C: // FF
+ case 0x0D: // CR
+ case 0x20: // space
+ continue spaces;
+ default:
+ break name;
+ }
+ }
+ case 0x2f: // /
+ // Attribute without a value; keep looking for further attributes.
+ return true;
+ case 0x3C: // <
+ unread(b);
+ return false;
+ case 0x3E: // >
+ return false;
+ default:
+ // Buffer the name (ASCII upper case folded to lower case), but only
+ // for tags whose name matched "meta".
+ if (metaState == MetaState.A) {
+ // could use a highly-efficient state machine
+ // here instead of a buffer...
+ if (b >= 0x41 && b <= 0x5A) {
+ attributeName.append((char) (b + 0x20));
+ } else {
+ attributeName.append((char) b);
+ }
+ }
+ continue name;
+ }
+ }
+ if (b != 0x3D) {
+ // "If the byte at position is not 0x3D (ASCII '='), stop looking
+ // for
+ // an attribute. Move position back to the previous byte."
+ unread(b);
+ return true;
+ }
+ // Step 3: skip white space between '=' and the value.
+ value: for (;;) {
+ b = read();
+ switch (b) {
+ case 0x09: // tab
+ case 0x0A: // LF
+ case 0x0B: // VT
+ case 0x0C: // FF
+ case 0x0D: // CR
+ case 0x20: // space
+ continue value;
+ default:
+ break value;
+ }
+ }
+ // Step 4: read the value; the opening byte selects quoted or unquoted.
+ switch (b) {
+ case 0x22: // "
+ quotedAttribute(0x22);
+ return true;
+ case 0x27: // '
+ quotedAttribute(0x27);
+ return true;
+ case 0x3C: // <
+ unread(b);
+ return false;
+ case 0x3E: // >
+ return false;
+ default:
+ // First byte of an unquoted value; push it back so that
+ // unquotedAttribute() reads the whole value.
+ unread(b);
+ return unquotedAttribute();
+ }
+ }
+
+ /**
+  * Reads an unquoted attribute value up to white space,
+  * <code>&gt;</code> or <code>&lt;</code>, then checks the completed
+  * attribute. The value is buffered (case left as is) only while the
+  * tag name matched "meta".
+  *
+  * @return <code>true</code> if the value ended at white space, so more
+  *         attributes may follow; <code>false</code> if the tag ended
+  * @throws SAXException if the error handler throws
+  * @throws StopSniffingException if sniffing is finished
+  * @throws IOException if the source fails
+  */
+ private boolean unquotedAttribute() throws SAXException, StopSniffingException, IOException {
+     while (true) {
+         int b = read();
+         if (b == 0x3E) { // >
+             checkAttribute();
+             return false;
+         }
+         if (b == 0x3C) { // <
+             checkAttribute();
+             // hand the '<' back so the main loop sees the new construct
+             unread(b);
+             return false;
+         }
+         if (b == 0x20 || (b >= 0x09 && b <= 0x0D)) { // space, tab, LF, VT, FF, CR
+             checkAttribute();
+             return true;
+         }
+         // omitting uppercasing
+         if (metaState == MetaState.A) {
+             attributeValue.append((char) b);
+         }
+     }
+ }
+
+ /**
+  * Inspects the attribute that has just been read. Does nothing unless
+  * the tag name matched "meta". A <code>charset</code> attribute is
+  * tried directly (trimmed); for a <code>content</code> attribute the
+  * charset parameter is extracted with the CONTENT pattern and tried.
+  *
+  * @throws SAXException if the error handler throws
+  * @throws StopSniffingException if an encoding was accepted
+  */
+ private void checkAttribute() throws SAXException, StopSniffingException {
+     if (metaState != MetaState.A) {
+         return;
+     }
+     String name = attributeName.toString();
+     if ("charset".equals(name)) {
+         // XXX revisit trim() to trim only space characters
+         tryCharset(attributeValue.toString().trim());
+         return;
+     }
+     if (!"content".equals(name)) {
+         return;
+     }
+     Matcher m = CONTENT.matcher(attributeValue);
+     if (!m.matches()) {
+         return;
+     }
+     // Groups 1..3 hold the unquoted, double-quoted and single-quoted
+     // forms; use the first one that took part in the match.
+     int group = 1;
+     while (group < 4) {
+         String candidate = m.group(group);
+         group++;
+         if (candidate != null) {
+             tryCharset(candidate);
+             return;
+         }
+     }
+ }
+
+ /**
+  * Tries to adopt the declared encoding. On success the decoder is
+  * stored and sniffing is stopped by throwing
+  * <code>StopSniffingException</code>. UTF-16 and UTF-32 declarations
+  * are replaced with UTF-8 (and reported as errors). Encodings that are
+  * illegal, unsupported or not ASCII supersets are reported, and the
+  * method returns normally so that sniffing continues.
+  *
+  * @param encoding the encoding label taken from the attribute
+  * @throws SAXException if the error handler throws
+  * @throws StopSniffingException if a decoder has been chosen
+  */
+ private void tryCharset(String encoding) throws SAXException, StopSniffingException {
+     // Upper-case with a fixed locale: with the default locale, a Turkish
+     // environment would turn "i" into a dotted capital I, which makes
+     // labels such as "iso-8859-1" illegal charset names.
+     encoding = encoding.toUpperCase(Locale.ENGLISH);
+     try {
+         // XXX deviating from the spec as per mjs on IRC.
+         if ("UTF-16".equals(encoding) || "UTF-16BE".equals(encoding) || "UTF-16LE".equals(encoding) || "UTF-32".equals(encoding) || "UTF-32BE".equals(encoding) || "UTF-32LE".equals(encoding)) {
+             this.charsetDecoder = Charset.forName("UTF-8").newDecoder();
+             err("The internal character encoding declaration specified \u201C" + encoding + "\u201D which is not a rough superset of ASCII. Using \u201CUTF-8\u201D instead.");
+             throw new StopSniffingException();
+         } else {
+             Charset cs = Charset.forName(encoding);
+             String canonName = cs.name();
+             if (!EncodingInfo.isAsciiSuperset(canonName)) {
+                 err("The encoding \u201C"
+                         + encoding
+                         + "\u201D is not an ASCII superset and, therefore, cannot be used in an internal encoding declaration. Continuing the sniffing algorithm.");
+                 return;
+             }
+             if (canonName.startsWith("X-") || canonName.startsWith("x-")
+                     || canonName.startsWith("Mac")) {
+                 // encoding is upper-cased above, so "X-" also covers "x-"
+                 if (encoding.startsWith("X-")) {
+                     err("The encoding \u201C" + encoding
+                             + "\u201D is not an IANA-registered encoding. (Charmod C022)");
+                 } else {
+                     err("The encoding \u201C" + encoding
+                             + "\u201D is not an IANA-registered encoding and didn\u2019t start with \u201CX-\u201D. (Charmod C023)");
+                 }
+             } else if (!canonName.equalsIgnoreCase(encoding)) {
+                 err("The encoding \u201C" + encoding
+                         + "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C"
+                         + canonName + "\u201D. (Charmod C024)");
+             }
+             if (EncodingInfo.isObscure(canonName)) {
+                 warn("The character encoding \u201C" + encoding + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D.");
+             }
+             this.charsetDecoder = cs.newDecoder();
+             throw new StopSniffingException();
+         }
+     } catch (IllegalCharsetNameException e) {
+         // Deliberately not rethrown: report and keep sniffing.
+         err("Illegal character encoding name: \u201C" + encoding + "\u201D. Will continue sniffing.");
+     } catch (UnsupportedCharsetException e) {
+         // Deliberately not rethrown: report and keep sniffing.
+         err("Unsupported character encoding name: \u201C" + encoding + "\u201D. Will continue sniffing.");
+     }
+ }
+
+ /**
+  * Reports an error at the current position to the error handler, if
+  * one was supplied.
+  *
+  * @param message the error message
+  * @throws SAXException if the error handler throws
+  */
+ private void err(String message) throws SAXException {
+     if (errorHandler == null) {
+         return;
+     }
+     errorHandler.error(new SAXParseException(message, this));
+ }
+
+ /**
+  * Reports a warning at the current position to the error handler, if
+  * one was supplied.
+  *
+  * @param message the warning message
+  * @throws SAXException if the error handler throws
+  */
+ private void warn(String message) throws SAXException {
+     if (errorHandler == null) {
+         return;
+     }
+     errorHandler.warning(new SAXParseException(message, this));
+ }
+
+ /**
+  * Reads a quoted attribute value up to the closing delimiter, then
+  * checks the completed attribute. The value is buffered only while the
+  * tag name matched "meta".
+  *
+  * @param delim the quote byte that opened, and therefore closes, the value
+  * @throws SAXException if the error handler throws
+  * @throws StopSniffingException if sniffing is finished
+  * @throws IOException if the source fails
+  */
+ private void quotedAttribute(int delim) throws SAXException, StopSniffingException, IOException {
+     for (int b = read(); b != delim; b = read()) {
+         if (metaState == MetaState.A) {
+             attributeValue.append((char) b);
+         }
+     }
+     checkAttribute();
+ }
+
+ private void consumeUntilAndIncludingGt() throws IOException, StopSniffingException {
+ for (;;) {
+ if (read() == 0x3E) { // >
+ return;
+ }
+ }
+ }
+
+ /**
+  * Seen &lt; , /. If an ASCII letter follows, the construct is treated
+  * as a tag that can never be "meta"; its name and attributes are
+  * consumed. Otherwise everything up to and including the next
+  * greater-than sign is skipped.
+  *
+  * @throws SAXException if the error handler throws
+  * @throws StopSniffingException if sniffing is finished
+  * @throws IOException if the source fails
+  */
+ private void endTag() throws SAXException, StopSniffingException, IOException {
+     int b = read();
+     boolean asciiLetter = (b >= 0x41 && b <= 0x5A) || (b >= 0x61 && b <= 0x7A);
+     if (asciiLetter) {
+         metaState = MetaState.NO;
+         tag();
+     } else {
+         // The lookahead byte may itself be the terminating '>' (as in
+         // "</>"). Push it back so the skip stops there instead of running
+         // on to a later '>' and swallowing the markup in between.
+         unread(b);
+         consumeUntilAndIncludingGt();
+     }
+ }
+
+ /**
+  * Seen &lt; , !. A following hyphen may start a comment; anything else
+  * is skipped up to and including the next greater-than sign.
+  *
+  * @throws IOException if the source fails
+  * @throws StopSniffingException if the source ends
+  */
+ private void markupDecl() throws IOException, StopSniffingException {
+     int b = read();
+     if (b == 0x2D) { // -
+         comment();
+     } else {
+         // The lookahead byte may itself be the terminating '>' (as in
+         // "<!>"). Push it back so the skip stops there instead of running
+         // on to a later '>' and swallowing the markup in between.
+         unread(b);
+         consumeUntilAndIncludingGt();
+     }
+ }
+
+ /**
+  * Seen &lt; , ! , -. If a second hyphen follows, this is a comment and
+  * bytes are skipped up to the first greater-than sign that is
+  * immediately preceded by at least two hyphens; the two hyphens of the
+  * opening sequence count, so "&lt;!--&gt;" is a complete comment.
+  * Otherwise everything up to and including the next greater-than sign
+  * is skipped.
+  *
+  * @throws IOException if the source fails
+  * @throws StopSniffingException if the source ends
+  */
+ private void comment() throws IOException, StopSniffingException {
+     int b = read();
+     if (b != 0x2D) { // not -
+         // The lookahead byte may itself be the terminating '>' (as in
+         // "<!->"). Push it back so the skip stops there instead of
+         // running on to a later '>' and swallowing the markup in between.
+         unread(b);
+         consumeUntilAndIncludingGt();
+         return;
+     }
+     int hyphensSeen = 2;
+     for (;;) {
+         b = read();
+         if (b == 0x2D) { // -
+             hyphensSeen++;
+         } else if (b == 0x3E && hyphensSeen >= 2) { // > closing the comment
+             return;
+         } else {
+             // any other byte, or a '>' not preceded by two hyphens
+             hyphensSeen = 0;
+         }
+     }
+ }
+
+ /**
+  * Returns the column counter maintained by <code>read()</code>.
+  *
+  * @return the current column
+  */
+ public int getColumnNumber() {
+     return this.col;
+ }
+
+ /**
+  * Returns the line counter maintained by <code>read()</code>; counting
+  * starts at 1.
+  *
+  * @return the current line
+  */
+ public int getLineNumber() {
+     return this.line;
+ }
+
+ /**
+  * Returns the public identifier reported by the delegate locator.
+  *
+  * @return the delegate's public identifier, or <code>null</code> if no
+  *         delegate was supplied
+  */
+ public String getPublicId() {
+     return (locator == null) ? null : locator.getPublicId();
+ }
+
+ /**
+  * Returns the system identifier reported by the delegate locator.
+  *
+  * @return the delegate's system identifier, or <code>null</code> if no
+  *         delegate was supplied
+  */
+ public String getSystemId() {
+     return (locator == null) ? null : locator.getSystemId();
+ }
+
+}
Added: incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/TokenHandler.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/TokenHandler.java?rev=587550&view=auto
==============================================================================
--- incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/TokenHandler.java (added)
+++ incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/TokenHandler.java Tue Oct 23 09:28:51 2007
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2007 Henri Sivonen
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+package nu.validator.htmlparser.impl;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+/**
+ * Callback interface through which a <code>Tokenizer</code> reports the
+ * tokens it produces.
+ *
+ * @version $Id: TokenHandler.java 150 2007-08-16 19:21:25Z hsivonen $
+ * @author hsivonen
+ */
+public interface TokenHandler {
+
+    /**
+     * Called at the start of tokenization, before any other method of
+     * this interface. Implementations should keep the reference to the
+     * <code>Tokenizer</code> so that they can set the content model flag
+     * and query it for <code>Locator</code> data.
+     *
+     * @param self the <code>Tokenizer</code>
+     * @throws SAXException if something went wrong
+     */
+    void start(Tokenizer self) throws SAXException;
+
+    /**
+     * Tells the tokenizer whether this handler cares about comments.
+     *
+     * @return <code>true</code> if this handler wants comments,
+     *         <code>false</code> if not
+     * @throws SAXException if something went wrong
+     */
+    boolean wantsComments() throws SAXException;
+
+    /**
+     * Receives a doctype token.
+     *
+     * @param name the name
+     * @param publicIdentifier the public id
+     * @param systemIdentifier the system id
+     * @param correct whether the token is correct
+     * @throws SAXException if something went wrong
+     */
+    void doctype(String name, String publicIdentifier, String systemIdentifier, boolean correct) throws SAXException;
+
+    /**
+     * Receives a start tag token.
+     *
+     * @param name the tag name
+     * @param attributes the attributes
+     * @throws SAXException if something went wrong
+     */
+    void startTag(String name, Attributes attributes) throws SAXException;
+
+    /**
+     * Receives an end tag token.
+     *
+     * @param name the tag name
+     * @param attributes the attributes
+     * @throws SAXException if something went wrong
+     */
+    void endTag(String name, Attributes attributes) throws SAXException;
+
+    /**
+     * Receives a comment token. The data is junk if
+     * <code>wantsComments()</code> returned <code>false</code>.
+     *
+     * @param buf a buffer holding the data
+     * @param length the number of code units to read
+     * @throws SAXException if something went wrong
+     */
+    void comment(char[] buf, int length) throws SAXException;
+
+    /**
+     * Receives character tokens, with the same semantics as the SAX
+     * method of the same name.
+     *
+     * @param buf a buffer holding the data
+     * @param start offset into the buffer
+     * @param length the number of code units to read
+     * @throws SAXException if something went wrong
+     * @see org.xml.sax.ContentHandler#characters(char[], int, int)
+     */
+    void characters(char[] buf, int start, int length) throws SAXException;
+
+    /**
+     * Receives the end-of-file token.
+     *
+     * @throws SAXException if something went wrong
+     */
+    void eof() throws SAXException;
+
+}