You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@abdera.apache.org by jm...@apache.org on 2007/10/23 18:28:58 UTC

svn commit: r587550 [2/6] - in /incubator/abdera/java/trunk/extensions/json/src/main: java/nu/ java/nu/validator/ java/nu/validator/htmlparser/ java/nu/validator/htmlparser/common/ java/nu/validator/htmlparser/impl/ java/nu/validator/htmlparser/sax/ ja...

Added: incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/MetaSniffer.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/MetaSniffer.java?rev=587550&view=auto
==============================================================================
--- incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/MetaSniffer.java (added)
+++ incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/MetaSniffer.java Tue Oct 23 09:28:51 2007
@@ -0,0 +1,566 @@
+/*
+ * Copyright (c) 2007 Henri Sivonen
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a 
+ * copy of this software and associated documentation files (the "Software"), 
+ * to deal in the Software without restriction, including without limitation 
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+ * and/or sell copies of the Software, and to permit persons to whom the 
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+package nu.validator.htmlparser.impl;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.UnsupportedCharsetException;
+import java.util.Locale;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.xml.sax.ErrorHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXParseException;
+
+public final class MetaSniffer implements Locator {
+
+    /**
+     * Control-flow signal used internally to end sniffing. It is thrown by
+     * <code>read()</code> at end of input and by <code>tryCharset()</code>
+     * once a decoder has been chosen, and is caught by <code>sniff()</code>.
+     * Declared <code>static</code> because it needs no reference to the
+     * enclosing <code>MetaSniffer</code> instance.
+     */
+    private static final class StopSniffingException extends Exception {
+
+        private static final long serialVersionUID = 1L;
+
+    }
+
+    /**
+     * Matches the value of a <code>content</code> attribute that carries a
+     * charset parameter: any text up to the first <code>;</code>, optional
+     * whitespace (tab, LF, VT, FF, CR, space), the word "charset" in any
+     * letter case, optional whitespace, <code>=</code>, optional whitespace,
+     * then the encoding label. The label is captured in group 1 when
+     * unquoted, group 2 when double-quoted and group 3 when single-quoted.
+     */
+    private static final Pattern CONTENT = Pattern.compile("^[^;]*;[\\x09\\x0A\\x0B\\x0C\\x0D\\x20]*[cC][hH][aA][rR][sS][eE][tT][\\x09\\x0A\\x0B\\x0C\\x0D\\x20]*=[\\x09\\x0A\\x0B\\x0C\\x0D\\x20]*(?:(?:([^'\"\\x09\\x0A\\x0B\\x0C\\x0D\\x20][^\\x09\\x0A\\x0B\\x0C\\x0D\\x20]*)(?:[\\x09\\x0A\\x0B\\x0C\\x0D\\x20].*)?)|(?:\"([^\"]*)\".*)|(?:'([^']*)'.*))$", Pattern.DOTALL);
+    
+    /**
+     * Progress through recognizing the tag name "meta" (case-insensitively)
+     * one byte at a time. <code>M</code>, <code>E</code>, <code>T</code> and
+     * <code>A</code> mean the name matched so far ends with that letter;
+     * <code>NO</code> means the current tag is not a meta element.
+     */
+    private enum MetaState {
+        NO, M, E, T, A
+    }
+
+    /** The byte source being scanned. */
+    private final ByteReadable source;
+    
+    /** Receives errors and warnings; when <code>null</code> they are dropped. */
+    private final ErrorHandler errorHandler;
+    
+    /** Decoder chosen by <code>tryCharset()</code>; <code>null</code> until one is accepted. */
+    private CharsetDecoder charsetDecoder = null;
+    
+    /** Name of the attribute being read (ASCII upper case folded to lower); filled only inside a meta tag. */
+    private StringBuilder attributeName = new StringBuilder();
+
+    /** Value of the attribute being read; filled only inside a meta tag. */
+    private StringBuilder attributeValue = new StringBuilder();
+
+    /** How much of the tag name "meta" the current tag has matched. */
+    private MetaState metaState = MetaState.NO;
+
+    /** One-byte pushback slot; <code>-1</code> when empty. */
+    private int unread = -1;
+
+    /** Current line number, starting at 1. */
+    private int line = 1;
+    
+    /** Number of bytes read since the last line break. */
+    private int col = 0;
+    
+    /** Whether the last fresh byte was CR, so that a following LF is not counted as a second line break. */
+    private boolean prevWasCR = false;
+
+    /** Source of the public and system ids reported by this locator; may be <code>null</code>. */
+    private final Locator locator;
+    
+    /**
+     * Creates a sniffer over the given byte source.
+     * 
+     * @param source the bytes to scan
+     * @param eh receives errors and warnings; may be <code>null</code>
+     * @param locator supplies the public and system ids; may be <code>null</code>
+     */
+    public MetaSniffer(ByteReadable source, ErrorHandler eh, Locator locator) {
+        this.source = source;
+        this.errorHandler = eh;
+        this.locator = locator;
+    }
+
+    /**
+     * Returns the next byte: the pushed-back byte if there is one, otherwise
+     * a fresh byte from <code>source</code>. Line and column counters are
+     * updated only for fresh bytes; CR, LF and CR LF each count as a single
+     * line break.
+     * 
+     * @return the byte value
+     * @throws IOException propagated from <code>source.readByte()</code>
+     * @throws StopSniffingException when the source reports end of input (-1)
+     */
+    // Author's note: making this method return an int instead of a char was
+    // probably a mistake :-(
+    private int read() throws IOException, StopSniffingException {
+        if (unread != -1) {
+            int pending = unread;
+            unread = -1;
+            return pending;
+        }
+        int b = source.readByte();
+        if (b == -1) { // end
+            throw new StopSniffingException();
+        }
+        if (b == 0x0D) { // CR
+            line++;
+            col = 0;
+            prevWasCR = true;
+        } else if (b == 0x0A) { // LF
+            if (!prevWasCR) { // an LF right after CR was already counted
+                line++;
+                col = 0;
+            }
+            prevWasCR = false;
+        } else {
+            col++;
+            prevWasCR = false;
+        }
+        return b;
+    }
+
+    /**
+     * Pushes one byte back so that the next <code>read()</code> returns it.
+     * There is a single slot: a second call before the next
+     * <code>read()</code> overwrites the first byte.
+     * 
+     * @param b the byte to hand back
+     */
+    private void unread(int b) {
+        this.unread = b;
+    }
+
+    /**
+     * Main loop. Scans the byte source for markup until either a usable
+     * encoding declaration is found or the input ends.
+     * 
+     * @return the decoder for the declared encoding, or <code>null</code>
+     *         if the input ended without an accepted declaration
+     * @throws SAXException propagated from the error handler
+     * @throws IOException propagated from the byte source
+     */
+    public CharsetDecoder sniff() throws SAXException, IOException {
+        try {
+            while (true) {
+                int b = read();
+                if (b == 0x3C) { // <
+                    markup();
+                }
+            }
+        } catch (StopSniffingException stop) {
+            // thrown at end of input or once tryCharset() has picked a decoder
+            return charsetDecoder;
+        }
+    }
+
+    /**
+     * Seen &lt;. Dispatches on the following byte: markup declarations,
+     * end tags, processing instructions and start tags. A start tag
+     * beginning with m or M is tracked as a possible meta element.
+     * 
+     * @throws SAXException propagated from the error handler
+     * @throws StopSniffingException at end of input or once a charset has been accepted
+     * @throws IOException propagated from the byte source
+     */
+    private void markup() throws SAXException, StopSniffingException, IOException {
+        int b = read();
+        if (b == 0x21) { // !
+            markupDecl();
+        } else if (b == 0x2F) { // /
+            endTag();
+        } else if (b == 0x3F) { // ?
+            consumeUntilAndIncludingGt();
+        } else if (b == 0x4D || b == 0x6D) { // m or M
+            metaState = MetaState.M;
+            tag();
+        } else if ((b >= 0x41 && b <= 0x5A) || (b >= 0x61 && b <= 0x7A)) { // other ASCII letter
+            metaState = MetaState.NO;
+            tag();
+        } else if (b == 0x3C) { // <
+            // In input such as "<<meta ..." the second '<' starts the real
+            // tag; hand it back so sniff() sees it instead of swallowing it.
+            unread(b);
+        }
+    }
+
+    /**
+     * Seen &lt; and the first letter of a tag name. Reads the rest of the
+     * name, advancing <code>metaState</code> while the name keeps spelling
+     * "meta", then processes the tag's attributes.
+     * 
+     * @throws SAXException propagated from the error handler
+     * @throws StopSniffingException at end of input or once a charset has been accepted
+     * @throws IOException propagated from the byte source
+     */
+    private void tag() throws SAXException, StopSniffingException, IOException {
+        int b;
+        for (;;) {
+            b = read();
+            // tab, LF, VT, FF, CR, space, '>' or '<' ends the tag name
+            boolean endsName = b == 0x09 || b == 0x0A || b == 0x0B
+                    || b == 0x0C || b == 0x0D || b == 0x20 || b == 0x3E
+                    || b == 0x3C;
+            if (endsName) {
+                break;
+            }
+            if ((b == 0x45 || b == 0x65) && metaState == MetaState.M) { // E or e
+                metaState = MetaState.E;
+            } else if ((b == 0x54 || b == 0x74) && metaState == MetaState.E) { // T or t
+                metaState = MetaState.T;
+            } else if ((b == 0x41 || b == 0x61) && metaState == MetaState.T) { // A or a
+                metaState = MetaState.A;
+            } else {
+                // any other byte, or a letter out of sequence
+                metaState = MetaState.NO;
+            }
+        }
+        unread(b);
+        if (b == 0x3C) { // a new tag starts; the caller picks it up
+            return;
+        }
+        while (attribute()) {
+            // keep reading attributes until the tag ends
+        }
+    }
+
+    /**
+     * The "get an attribute" subalgorithm. Skips leading whitespace and
+     * slashes, reads one attribute name and, if an <code>=</code> follows,
+     * its value. Names are buffered only while <code>metaState</code> is
+     * <code>A</code>. The value readers called from here pass a completed
+     * value to <code>checkAttribute()</code>.
+     * 
+     * @return <code>true</code> if the caller should look for another
+     *         attribute; <code>false</code> when the tag has ended (a
+     *         <code>&gt;</code> was consumed or a <code>&lt;</code> was
+     *         pushed back)
+     * @throws SAXException propagated from the error handler
+     * @throws StopSniffingException at end of input or once a charset has been accepted
+     * @throws IOException propagated from the byte source
+     */
+    private boolean attribute() throws SAXException, StopSniffingException, IOException {
+        int b;
+        // Skip whitespace and slashes before the attribute name.
+        loop: for (;;) {
+            b = read();
+            switch (b) {
+                case 0x09: // tab
+                case 0x0A: // LF
+                case 0x0B: // VT
+                case 0x0C: // FF
+                case 0x0D: // CR
+                case 0x20: // space
+                case 0x2F: // /
+                    continue loop;
+                default:
+                    break loop;
+            }
+        }
+        if (b == 0x3C) { // <
+            unread(b);
+            return false;
+        }
+        if (b == 0x3E) { // >
+            return false;
+        }
+        // Start a fresh attribute.
+        attributeName.setLength(0);
+        attributeValue.setLength(0);
+        unread(b); // this is a bit ugly
+        // Read the attribute name up to '=', whitespace, '/', '<' or '>'.
+        name: for (;;) {
+            b = read();
+            switch (b) {
+                case 0x3D: // =
+                    // not actually advancing here yet
+                    break name;
+                case 0x09: // tab
+                case 0x0A: // LF
+                case 0x0B: // VT
+                case 0x0C: // FF
+                case 0x0D: // CR
+                case 0x20: // space
+                    // Whitespace after the name: skip it and leave the loop
+                    // with b holding the first non-whitespace byte.
+                    spaces: for (;;) {
+                        b = read();
+                        switch (b) {
+                            case 0x09: // tab
+                            case 0x0A: // LF
+                            case 0x0B: // VT
+                            case 0x0C: // FF
+                            case 0x0D: // CR
+                            case 0x20: // space
+                                continue spaces;
+                            default:
+                                break name;
+                        }
+                    }
+                case 0x2f: // /
+                    // The attribute has no value; the caller tries the next one.
+                    return true;
+                case 0x3C: // <
+                    unread(b);
+                    return false;
+                case 0x3E: // >
+                    return false;
+                default:
+                    if (metaState == MetaState.A) {
+                        // could use a highly-efficient state machine
+                        // here instead of a buffer...
+                        if (b >= 0x41 && b <= 0x5A) {
+                            // fold ASCII upper case to lower case
+                            attributeName.append((char) (b + 0x20));
+                        } else {
+                            attributeName.append((char) b);
+                        }
+                    }
+                    continue name;
+            }
+        }
+        if (b != 0x3D) {
+            // "If the byte at position is not 0x3D (ASCII '='), stop looking
+            // for
+            // an attribute. Move position back to the previous byte."
+            unread(b);
+            return true;
+        }
+        // Skip whitespace between '=' and the value.
+        value: for (;;) {
+            b = read();
+            switch (b) {
+                case 0x09: // tab
+                case 0x0A: // LF
+                case 0x0B: // VT
+                case 0x0C: // FF
+                case 0x0D: // CR
+                case 0x20: // space
+                    continue value;
+                default:
+                    break value;
+            }
+        }
+        // Dispatch on the first byte of the value.
+        switch (b) {
+            case 0x22: // "
+                quotedAttribute(0x22);
+                return true;
+            case 0x27: // '
+                quotedAttribute(0x27);
+                return true;
+            case 0x3C: // <
+                unread(b);
+                return false;
+            case 0x3E: // >
+                return false;
+            default:
+                // First byte of an unquoted value: hand it back so that
+                // unquotedAttribute() reads the value from its start.
+                unread(b);
+                return unquotedAttribute();
+        }
+    }
+
+    /**
+     * Reads an unquoted attribute value up to whitespace, <code>&gt;</code>
+     * or <code>&lt;</code>, then passes the attribute to
+     * <code>checkAttribute()</code>. The value is buffered only while
+     * <code>metaState</code> is <code>A</code>.
+     * 
+     * @return <code>true</code> if the value ended at whitespace (more
+     *         attributes may follow); <code>false</code> if the tag ended
+     * @throws SAXException propagated from the error handler
+     * @throws StopSniffingException at end of input or once a charset has been accepted
+     * @throws IOException propagated from the byte source
+     */
+    private boolean unquotedAttribute() throws SAXException, StopSniffingException, IOException {
+        for (;;) {
+            int b = read();
+            boolean whitespace = b == 0x09 || b == 0x0A || b == 0x0B
+                    || b == 0x0C || b == 0x0D || b == 0x20;
+            if (whitespace) {
+                checkAttribute();
+                return true;
+            }
+            if (b == 0x3E) { // >
+                checkAttribute();
+                return false;
+            }
+            if (b == 0x3C) { // <
+                checkAttribute();
+                unread(b);
+                return false;
+            }
+            // omitting uppercasing
+            if (metaState == MetaState.A) {
+                attributeValue.append((char) b);
+            }
+        }
+    }
+
+    /**
+     * Examines the attribute just read. Inside a meta tag, a
+     * <code>charset</code> attribute supplies the encoding label directly,
+     * and a <code>content</code> attribute supplies it through the
+     * <code>CONTENT</code> pattern. Other attributes are ignored.
+     * 
+     * @throws SAXException propagated from the error handler
+     * @throws StopSniffingException once <code>tryCharset()</code> settles on a decoder
+     */
+    private void checkAttribute() throws SAXException, StopSniffingException {
+        if (metaState != MetaState.A) {
+            return;
+        }
+        String name = attributeName.toString();
+        if ("charset".equals(name)) {
+            // XXX revisit trim() to trim only space characters
+            tryCharset(attributeValue.toString().trim());
+            return;
+        }
+        if (!"content".equals(name)) {
+            return;
+        }
+        Matcher m = CONTENT.matcher(attributeValue);
+        if (!m.matches()) {
+            return;
+        }
+        // Groups 1..3 hold the unquoted, double-quoted and single-quoted label.
+        int group = 1;
+        while (group < 4) {
+            String label = m.group(group);
+            if (label != null) {
+                tryCharset(label);
+                return;
+            }
+            group++;
+        }
+    }
+
+    /**
+     * Tries to adopt the declared encoding.
+     * <ul>
+     * <li>UTF-16 and UTF-32 variants are replaced by UTF-8, an error is
+     * reported and sniffing stops.</li>
+     * <li>An encoding that <code>EncodingInfo</code> does not consider an
+     * ASCII superset is reported and sniffing continues.</li>
+     * <li>An illegal or unsupported name is reported and sniffing
+     * continues.</li>
+     * <li>Any other encoding is accepted and sniffing stops, possibly after
+     * naming errors or an obscurity warning.</li>
+     * </ul>
+     * 
+     * @param encoding the encoding label as declared in the document
+     * @throws SAXException propagated from the error handler
+     * @throws StopSniffingException when a decoder has been chosen
+     */
+    private void tryCharset(String encoding) throws SAXException, StopSniffingException {
+        // Use a fixed locale: the default-locale toUpperCase() would turn
+        // "i" into a dotted capital I under e.g. a Turkish locale and make
+        // names such as "iso-8859-1" unrecognizable.
+        encoding = encoding.toUpperCase(Locale.ENGLISH);
+        try {
+            // XXX deviating from the spec as per mjs on IRC.
+            if ("UTF-16".equals(encoding) || "UTF-16BE".equals(encoding) || "UTF-16LE".equals(encoding) || "UTF-32".equals(encoding) || "UTF-32BE".equals(encoding) || "UTF-32LE".equals(encoding)) {
+                this.charsetDecoder = Charset.forName("UTF-8").newDecoder();
+                err("The internal character encoding declaration specified \u201C" + encoding + "\u201D which is not a rough superset of ASCII. Using \u201CUTF-8\u201D instead.");
+                throw new StopSniffingException();
+            } else {
+                Charset cs = Charset.forName(encoding);
+                String canonName = cs.name();
+                if (!EncodingInfo.isAsciiSuperset(canonName)) {
+                    err("The encoding \u201C"
+                                + encoding
+                                + "\u201D is not an ASCII superset and, therefore, cannot be used in an internal encoding declaration. Continuing the sniffing algorithm.");
+                    return;
+                }
+                if (canonName.startsWith("X-") || canonName.startsWith("x-")
+                        || canonName.startsWith("Mac")) {
+                    if (encoding.startsWith("X-")) {
+                        err("The encoding \u201C" + encoding
+                                + "\u201D is not an IANA-registered encoding. (Charmod C022)");
+                    } else {
+                        err("The encoding \u201C" + encoding
+                                + "\u201D is not an IANA-registered encoding and didn\u2019t start with \u201CX-\u201D. (Charmod C023)");
+                    }
+                } else if (!canonName.equalsIgnoreCase(encoding)) {
+                    err("The encoding \u201C" + encoding
+                            + "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C"
+                            + canonName + "\u201D. (Charmod C024)");
+                }
+                if (EncodingInfo.isObscure(canonName)) {
+                    warn("The character encoding \u201C" + encoding + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D.");
+                }
+                this.charsetDecoder = cs.newDecoder();
+                throw new StopSniffingException();
+            }
+        } catch (IllegalCharsetNameException e) {
+            err("Illegal character encoding name: \u201C" + encoding + "\u201D. Will continue sniffing.");
+        } catch (UnsupportedCharsetException e) {
+            err("Unsupported character encoding name: \u201C" + encoding + "\u201D. Will continue sniffing.");
+        }
+    }
+
+    /**
+     * Reports an error at the current position to the error handler, if
+     * there is one.
+     * 
+     * @param message the error message
+     * @throws SAXException propagated from the error handler
+     */
+    private void err(String message) throws SAXException {
+        if (errorHandler == null) {
+            return;
+        }
+        errorHandler.error(new SAXParseException(message, this));
+    }
+
+    /**
+     * Reports a warning at the current position to the error handler, if
+     * there is one.
+     * 
+     * @param message the warning message
+     * @throws SAXException propagated from the error handler
+     */
+    private void warn(String message) throws SAXException {
+        if (errorHandler == null) {
+            return;
+        }
+        errorHandler.warning(new SAXParseException(message, this));
+    }
+    
+    /**
+     * Reads a quoted attribute value up to the closing quote, then passes
+     * the attribute to <code>checkAttribute()</code>. The value is buffered
+     * only while <code>metaState</code> is <code>A</code>.
+     * 
+     * @param delim the quote byte that opened the value
+     * @throws SAXException propagated from the error handler
+     * @throws StopSniffingException at end of input or once a charset has been accepted
+     * @throws IOException propagated from the byte source
+     */
+    private void quotedAttribute(int delim) throws SAXException, StopSniffingException, IOException {
+        int b = read();
+        while (b != delim) {
+            if (metaState == MetaState.A) {
+                attributeValue.append((char) b);
+            }
+            b = read();
+        }
+        checkAttribute();
+    }
+
+    /**
+     * Discards bytes up to and including the next <code>&gt;</code>.
+     * 
+     * @throws IOException propagated from the byte source
+     * @throws StopSniffingException if the input ends first
+     */
+    private void consumeUntilAndIncludingGt() throws IOException, StopSniffingException {
+        int b;
+        do {
+            b = read();
+        } while (b != 0x3E); // >
+    }
+
+    /**
+     * Seen &lt; , /. If a letter follows, the end tag is scanned like a
+     * start tag that is not a meta element; otherwise everything up to the
+     * next <code>&gt;</code> is skipped.
+     * 
+     * @throws SAXException propagated from the error handler
+     * @throws StopSniffingException at end of input or once a charset has been accepted
+     * @throws IOException propagated from the byte source
+     */
+    private void endTag() throws SAXException, StopSniffingException, IOException {
+        int b = read();
+        if ((b >= 0x41 && b <= 0x5A) || (b >= 0x61 && b <= 0x7A)) { // ASCII
+            // letter
+            metaState = MetaState.NO;
+            tag();
+        } else {
+            // The byte just read may itself be the closing '>' (as in
+            // "</>"). Hand it back so the scan stops there instead of
+            // running on to a later '>' and skipping real markup.
+            unread(b);
+            consumeUntilAndIncludingGt();
+        }
+    }
+
+    /**
+     * Seen &lt; , !. A following hyphen may start a comment; anything else
+     * is skipped up to the next <code>&gt;</code>.
+     * 
+     * @throws IOException propagated from the byte source
+     * @throws StopSniffingException if the input ends
+     */
+    private void markupDecl() throws IOException, StopSniffingException {
+        int b = read();
+        if (b == 0x2D) { // -
+            comment();
+        } else {
+            // The byte just read may itself be the closing '>' (as in
+            // "<!>"). Hand it back so the scan stops there.
+            unread(b);
+            consumeUntilAndIncludingGt();
+        }
+    }
+
+    /**
+     * Seen &lt; , ! , -. If a second hyphen follows, skips a comment: it
+     * ends at the first <code>&gt;</code> immediately preceded by at least
+     * two hyphens, and the two opening hyphens count. Otherwise everything
+     * up to the next <code>&gt;</code> is skipped.
+     * 
+     * @throws IOException propagated from the byte source
+     * @throws StopSniffingException if the input ends
+     */
+    private void comment() throws IOException, StopSniffingException {
+        int b = read();
+        if (b != 0x2D) { // not "<!--" after all
+            // The byte just read may itself be the closing '>' (as in
+            // "<!->"). Hand it back so the scan stops there.
+            unread(b);
+            consumeUntilAndIncludingGt();
+            return;
+        }
+        int hyphensSeen = 2;
+        for (;;) {
+            b = read();
+            if (b == 0x2D) { // -
+                hyphensSeen++;
+            } else if (b == 0x3E && hyphensSeen >= 2) { // "-->"
+                return;
+            } else {
+                hyphensSeen = 0;
+            }
+        }
+    }
+
+    /**
+     * Returns the number of bytes read since the last line break.
+     * 
+     * @return the current column
+     */
+    public int getColumnNumber() {
+        return col;
+    }
+
+    /**
+     * Returns the current line number; counting starts at 1.
+     * 
+     * @return the current line
+     */
+    public int getLineNumber() {
+        return line;
+    }
+
+    /**
+     * Returns the public id reported by the wrapped locator.
+     * 
+     * @return the public id, or <code>null</code> if there is no locator
+     */
+    public String getPublicId() {
+        return locator == null ? null : locator.getPublicId();
+    }
+
+    /**
+     * Returns the system id reported by the wrapped locator.
+     * 
+     * @return the system id, or <code>null</code> if there is no locator
+     */
+    public String getSystemId() {
+        return locator == null ? null : locator.getSystemId();
+    }
+
+}

Added: incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/TokenHandler.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/TokenHandler.java?rev=587550&view=auto
==============================================================================
--- incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/TokenHandler.java (added)
+++ incubator/abdera/java/trunk/extensions/json/src/main/java/nu/validator/htmlparser/impl/TokenHandler.java Tue Oct 23 09:28:51 2007
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2007 Henri Sivonen
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a 
+ * copy of this software and associated documentation files (the "Software"), 
+ * to deal in the Software without restriction, including without limitation 
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+ * and/or sell copies of the Software, and to permit persons to whom the 
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+package nu.validator.htmlparser.impl;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+/**
+ * <code>Tokenizer</code> reports tokens through this interface.
+ * 
+ * @version $Id: TokenHandler.java 150 2007-08-16 19:21:25Z hsivonen $
+ * @author hsivonen
+ */
+public interface TokenHandler {
+
+    /**
+     * This method is called at the start of tokenization before any other 
+     * methods on this interface are called. Implementations should hold 
+     * the reference to the <code>Tokenizer</code> in order to set the 
+     * content model flag and in order to be able to query for 
+     * <code>Locator</code> data.
+     * 
+     * @param self the <code>Tokenizer</code>.
+     * @throws SAXException if something went wrong
+     */
+    public void start(Tokenizer self) throws SAXException;
+
+    /**
+     * If this handler implementation cares about comments, return <code>true</code>.
+     * If not, return <code>false</code>.
+     * 
+     * @return whether this handler wants comments
+     * @throws SAXException if something went wrong
+     */
+    public boolean wantsComments() throws SAXException;
+    
+    /**
+     * Receive a doctype token.
+     * 
+     * @param name the name
+     * @param publicIdentifier the public id
+     * @param systemIdentifier the system id
+     * @param correct whether the token is correct
+     * @throws SAXException if something went wrong
+     */
+    public void doctype(String name, String publicIdentifier, String systemIdentifier, boolean correct) throws SAXException;
+
+    /**
+     * Receive a start tag token.
+     * 
+     * @param name the tag name
+     * @param attributes the attributes
+     * @throws SAXException if something went wrong
+     */
+    public void startTag(String name, Attributes attributes) throws SAXException;
+    
+    /**
+     * Receive an end tag token.
+     * 
+     * @param name the tag name
+     * @param attributes the attributes
+     * @throws SAXException if something went wrong
+     */
+    public void endTag(String name, Attributes attributes) throws SAXException;
+    
+    /**
+     * Receive a comment token. The data is junk if 
+     * <code>wantsComments()</code> returned <code>false</code>.
+     * 
+     * @param buf a buffer holding the data
+     * @param length the number of code units to read
+     * @throws SAXException if something went wrong
+     */
+    public void comment(char[] buf, int length) throws SAXException;
+    
+    /**
+     * Receive character tokens. This method has the same semantics as 
+     * the SAX method of the same name.
+     * 
+     * @param buf a buffer holding the data
+     * @param start offset into the buffer
+     * @param length the number of code units to read
+     * @throws SAXException if something went wrong
+     * @see org.xml.sax.ContentHandler#characters(char[], int, int)
+     */
+    public void characters(char[] buf, int start, int length) throws SAXException;
+    
+    /**
+     * The end-of-file token.
+     * 
+     * @throws SAXException if something went wrong
+     */
+    public void eof() throws SAXException;
+    
+}