You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@shindig.apache.org by lr...@apache.org on 2008/11/03 22:41:25 UTC

svn commit: r710176 - in /incubator/shindig/trunk/java/gadgets/src: main/java/org/apache/shindig/gadgets/parse/ main/java/org/apache/shindig/gadgets/parse/caja/ main/java/org/apache/shindig/gadgets/parse/nekohtml/ main/java/org/apache/shindig/gadgets/r...

Author: lryan
Date: Mon Nov  3 13:41:23 2008
New Revision: 710176

URL: http://svn.apache.org/viewvc?rev=710176&view=rev
Log:
Introduce light-weight Neko parser with a simplified DOM
Abstracted serialization mechanism to provide for parser-specific artifacts
Added rewriter benchmarks
Various fixes to existing DOM based content rewriters

Added:
    incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/HtmlSerializer.java
    incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/nekohtml/NekoSimplifiedHtmlParser.java
    incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/nekohtml/test.html
    incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/parse/nekohtml/
    incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/parse/nekohtml/NekoSimplifiedHtmlParserTest.java
    incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/rewrite/LexerVsDomRewriteBenchmark.java
Modified:
    incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/ParseModule.java
    incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/caja/CajaHtmlParser.java
    incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/nekohtml/NekoHtmlParser.java
    incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/rewrite/HtmlContentRewriter.java
    incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/rewrite/JsTagConcatContentRewriter.java
    incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/rewrite/LinkingTagContentRewriter.java
    incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/rewrite/MutableContent.java
    incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/rewrite/StyleLinksContentRewriter.java
    incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/parse/HtmlParserTest.java
    incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/parse/ParseTreeSerializerBenchmark.java
    incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/rewrite/JsTagConcatContentRewriterTest.java
    incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/rewrite/LinkingTagContentRewriterTest.java
    incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/rewrite/StyleLinksContentRewriterTest.java

Added: incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/HtmlSerializer.java
URL: http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/HtmlSerializer.java?rev=710176&view=auto
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/HtmlSerializer.java (added)
+++ incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/HtmlSerializer.java Mon Nov  3 13:41:23 2008
@@ -0,0 +1,81 @@
+package org.apache.shindig.gadgets.parse;
+
+import org.w3c.dom.Document;
+
+import java.io.StringWriter;
+
+/**
+ * Serialize a w3c document. An implementation of this interface should be bound
+ * to the document produced by an implementor of HtmlParser and retrievable via
+ * document.getUserData(HtmlSerializer.KEY)
+ */
+public abstract class HtmlSerializer {
+
+  /**
+   * Used to key an instance of HtmlSerializer in
+   * document.getUserData
+   */
+  private static final String KEY = "serializer";
+
+  /**
+   * Used by a parser to record the original length of the content it parsed
+   * Can be used to optimize output buffers
+   */
+  private static final String ORIGINAL_LENGTH = "original-length";
+
+  /**
+   * Attach a serializer instance to the document
+   * @param doc
+   * @param serializer
+   * @param originalContent may be null
+   */
+  public static void attach(Document doc, HtmlSerializer serializer, String originalContent) {
+    doc.setUserData(KEY, serializer, null);
+    if (originalContent != null) {
+      doc.setUserData(ORIGINAL_LENGTH, originalContent.length(), null);
+    }
+  }
+
+  /**
+   * Get the length of the original version of the document
+   * @param doc
+   * @return
+   */
+  protected static int getOriginalLength(Document doc) {
+    Integer length = (Integer)doc.getUserData(ORIGINAL_LENGTH);
+    if (length == null) return -1;
+    return length;
+  }
+
+  /**
+   * Create a writer sized to the original length of the document
+   * @param doc
+   * @return
+   */
+  protected static StringWriter createWriter(Document doc) {
+    int originalLength = getOriginalLength(doc);
+    if (originalLength == -1) {
+      return new StringWriter(8192);
+    } else {
+      // Typically rewriting makes a document larger
+      return new StringWriter((originalLength * 11) / 10);
+    }
+  }
+
+  /**
+   * Call the attached serializer and output the document
+   * @param doc
+   * @return
+   */
+  public static String serialize(Document doc) {
+    return ((HtmlSerializer)doc.getUserData(KEY)).serializeImpl(doc);
+  }
+
+  /**
+   * Overridden by implementations
+   * @param doc
+   * @return
+   */
+  protected abstract String serializeImpl(Document doc);
+
+}

Modified: incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/ParseModule.java
URL: http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/ParseModule.java?rev=710176&r1=710175&r2=710176&view=diff
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/ParseModule.java (original)
+++ incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/ParseModule.java Mon Nov  3 13:41:23 2008
@@ -17,12 +17,11 @@
  */
 package org.apache.shindig.gadgets.parse;
 
-import org.apache.shindig.gadgets.parse.caja.CajaHtmlParser;
-
 import com.google.inject.AbstractModule;
 import com.google.inject.Provider;
-
-import org.w3c.dom.html.HTMLDocument;
+import org.apache.shindig.gadgets.parse.caja.CajaHtmlParser;
+import org.w3c.dom.DOMImplementation;
+import org.w3c.dom.bootstrap.DOMImplementationRegistry;
 
 /**
  * Provide parse bindings
@@ -36,36 +35,47 @@
   protected void configure() {
     //bind(GadgetHtmlParser.class).to(NekoHtmlParser.class);
     bind(GadgetHtmlParser.class).to(CajaHtmlParser.class);
-    bind(HTMLDocument.class).toProvider(HTMLDocumentProvider.class);
+    bind(DOMImplementation.class).toProvider(DOMImplementationProvider.class);
   }
 
   /**
    * Provider of new HTMLDocument implementations. Used to hide XML parser weirdness
    */
-  public static class HTMLDocumentProvider implements Provider<HTMLDocument> {
+  public static class DOMImplementationProvider implements Provider<DOMImplementation> {
 
-    Class htmlDocImpl;
+    DOMImplementation domImpl;
 
-    public HTMLDocumentProvider() {
+    public DOMImplementationProvider() {
+      try {
+        DOMImplementationRegistry registry = DOMImplementationRegistry.newInstance();
+        // Require the traversal API
+        domImpl = registry.getDOMImplementation("XML 1.0 Traversal 2.0");
+      } catch (Exception e) {
+        // Try another
+      }
       // This is ugly but effective
       try {
-        htmlDocImpl = Class.forName("org.apache.html.dom.HTMLDocumentImpl");
-      } catch (ClassNotFoundException cnfe) {
-        try {
-          htmlDocImpl = Class.forName("com.sun.org.apache.html.internal.dom.HTMLDocumentImpl");
-        } catch (ClassNotFoundException cnfe2) {
-          throw new RuntimeException("Could not find HTML DOM implementation", cnfe2);
+        if (domImpl == null) {
+          domImpl = (DOMImplementation)
+              Class.forName("org.apache.xerces.internal.dom.DOMImplementationImpl").
+                  getMethod("getDOMImplementation").invoke(null);
         }
+      } catch (Exception ex) {
+        //try another
       }
-    }
-
-    public HTMLDocument get() {
       try {
-        return (HTMLDocument) htmlDocImpl.newInstance();
-      } catch (Exception e) {
-        throw new RuntimeException("Could not create HTML DOM from class "
-            + htmlDocImpl.getName(), e);
+        if (domImpl == null) {
+        domImpl = (DOMImplementation)
+          Class.forName("com.sun.org.apache.xerces.internal.dom.DOMImplementationImpl").
+              getMethod("getDOMImplementation").invoke(null);
+        }
+      } catch (Exception ex) {
+        throw new RuntimeException("Could not find HTML DOM implementation", ex);
       }
     }
+
+    public DOMImplementation get() {
+      return domImpl;
+    }
   }
 }

Modified: incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/caja/CajaHtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/caja/CajaHtmlParser.java?rev=710176&r1=710175&r2=710176&view=diff
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/caja/CajaHtmlParser.java (original)
+++ incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/caja/CajaHtmlParser.java Mon Nov  3 13:41:23 2008
@@ -23,17 +23,17 @@
 import com.google.caja.reporting.MessageQueue;
 import com.google.caja.reporting.SimpleMessageQueue;
 import com.google.inject.Inject;
-import com.google.inject.Provider;
 import com.google.inject.Singleton;
 import org.apache.shindig.gadgets.GadgetException;
 import org.apache.shindig.gadgets.parse.GadgetHtmlParser;
-import org.w3c.dom.Attr;
-import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.w3c.dom.Node;
-import org.w3c.dom.html.HTMLDocument;
+import org.apache.shindig.gadgets.parse.HtmlSerializer;
+import org.apache.xml.serialize.HTMLSerializer;
+import org.apache.xml.serialize.OutputFormat;
+import org.w3c.dom.*;
 
+import java.io.IOException;
 import java.io.StringReader;
+import java.io.StringWriter;
 import java.net.URI;
 import java.net.URISyntaxException;
 
@@ -43,17 +43,19 @@
 @Singleton
 public class CajaHtmlParser extends GadgetHtmlParser {
 
-  Provider<HTMLDocument> documentProvider;
+  private final DOMImplementation documentProvider;
 
   @Inject
-  public CajaHtmlParser(Provider<HTMLDocument> documentProvider) {
+  public CajaHtmlParser(DOMImplementation documentProvider) {
     this.documentProvider = documentProvider;
   }
 
   @Override
   public Document parseDom(String source) throws GadgetException {
     // Wrap the whole thing in a top-level node to get full contents.
-    return makeDocument(getFragment(source));
+    Document document = makeDocument(getFragment(source));
+    HtmlSerializer.attach(document, new Serializer(), source);
+    return document;
   }
 
   DomTree.Fragment getFragment(String content) throws GadgetException {
@@ -87,8 +89,8 @@
     return new DomParser(new TokenQueue<HtmlTokenType>(lexer, source), false, mQueue);
   }
 
-  private HTMLDocument makeDocument(DomTree.Fragment fragment) {
-    HTMLDocument htmlDocument = documentProvider.get();
+  private Document makeDocument(DomTree.Fragment fragment) {
+    Document htmlDocument = documentProvider.createDocument(null, null, null);
 
     // Check if doc contains an HTML node. If so just add it and recurse
     for (DomTree node : fragment.children()) {
@@ -105,7 +107,7 @@
     return htmlDocument;
   }
 
-  private static void recurseDocument(HTMLDocument doc, Node parent, DomTree elem) {
+  private static void recurseDocument(Document doc, Node parent, DomTree elem) {
     if (elem instanceof DomTree.Tag) {
       DomTree.Tag tag = (DomTree.Tag) elem;
       Element element = doc.createElement(tag.getTagName());
@@ -127,4 +129,24 @@
       // TODO Implement for comment, fragment etc...
     }
   }
+
+  static class Serializer extends HtmlSerializer {
+
+    static final OutputFormat outputFormat = new OutputFormat();
+    static {
+      outputFormat.setPreserveSpace(true);
+      outputFormat.setPreserveEmptyAttributes(false);
+    }
+
+    public String serializeImpl(Document doc) {
+      StringWriter sw = createWriter(doc);
+      HTMLSerializer serializer = new HTMLSerializer(sw, outputFormat);
+      try {
+        serializer.serialize(doc);
+        return sw.toString();
+      } catch (IOException ioe) {
+        return null;
+      }
+    }
+  }
 }

Modified: incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/nekohtml/NekoHtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/nekohtml/NekoHtmlParser.java?rev=710176&r1=710175&r2=710176&view=diff
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/nekohtml/NekoHtmlParser.java (original)
+++ incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/nekohtml/NekoHtmlParser.java Mon Nov  3 13:41:23 2008
@@ -18,20 +18,23 @@
 package org.apache.shindig.gadgets.parse.nekohtml;
 
 import com.google.inject.Inject;
-import com.google.inject.Provider;
 import org.apache.shindig.common.xml.XmlUtil;
 import org.apache.shindig.gadgets.GadgetException;
 import org.apache.shindig.gadgets.parse.GadgetHtmlParser;
+import org.apache.shindig.gadgets.parse.HtmlSerializer;
+import org.apache.xml.serialize.HTMLSerializer;
+import org.apache.xml.serialize.OutputFormat;
 import org.cyberneko.html.parsers.DOMFragmentParser;
+import org.w3c.dom.DOMImplementation;
 import org.w3c.dom.Document;
 import org.w3c.dom.DocumentFragment;
 import org.w3c.dom.Node;
-import org.w3c.dom.html.HTMLDocument;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
 
 import java.io.IOException;
 import java.io.StringReader;
+import java.io.StringWriter;
 
 /**
  * Parser that uses the NekoHtml parser.
@@ -43,17 +46,19 @@
  */
 public class NekoHtmlParser extends GadgetHtmlParser {
 
-  Provider<HTMLDocument> documentProvider;
+  private final DOMImplementation documentProvider;
 
   @Inject
-  public NekoHtmlParser(Provider<HTMLDocument> documentProvider) {
+  public NekoHtmlParser(DOMImplementation documentProvider) {
     this.documentProvider = documentProvider;
   }
 
   @Override
   public Document parseDom(String source) throws GadgetException {
     try {
-      return parseFragment(source);
+      Document document = parseFragment(source);
+      HtmlSerializer.attach(document, new Serializer(), source);
+      return document;
     } catch (Exception e) {
       throw new GadgetException(GadgetException.Code.HTML_PARSE_ERROR, e);
     }
@@ -63,7 +68,7 @@
     InputSource input = new InputSource(new StringReader(source));
     DOMFragmentParser parser = new DOMFragmentParser();
 
-    HTMLDocument htmlDoc = documentProvider.get();
+    Document htmlDoc = documentProvider.createDocument(null, null, null);
     DocumentFragment fragment = htmlDoc.createDocumentFragment();
     parser.parse(input, fragment);
     Node htmlNode = XmlUtil.getFirstNamedChildNode(fragment, "HTML");
@@ -75,4 +80,24 @@
     }
     return htmlDoc;
   }
+
+  static class Serializer extends HtmlSerializer {
+
+    static final OutputFormat outputFormat = new OutputFormat();
+    static {
+      outputFormat.setPreserveSpace(true);
+      outputFormat.setPreserveEmptyAttributes(false);
+    }
+
+    public String serializeImpl(Document doc) {
+      StringWriter sw = createWriter(doc);
+      HTMLSerializer serializer = new HTMLSerializer(sw, outputFormat);
+      try {
+        serializer.serialize(doc);
+        return sw.toString();
+      } catch (IOException ioe) {
+        return null;
+      }
+    }
+  }
 }

Added: incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/nekohtml/NekoSimplifiedHtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/nekohtml/NekoSimplifiedHtmlParser.java?rev=710176&view=auto
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/nekohtml/NekoSimplifiedHtmlParser.java (added)
+++ incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/nekohtml/NekoSimplifiedHtmlParser.java Mon Nov  3 13:41:23 2008
@@ -0,0 +1,351 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ */
+package org.apache.shindig.gadgets.parse.nekohtml;
+
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Lists;
+import com.google.inject.Inject;
+import org.apache.shindig.common.xml.XmlUtil;
+import org.apache.shindig.gadgets.parse.GadgetHtmlParser;
+import org.apache.shindig.gadgets.parse.HtmlSerializer;
+import org.apache.xerces.xni.*;
+import org.apache.xerces.xni.parser.XMLDocumentSource;
+import org.apache.xerces.xni.parser.XMLInputSource;
+import org.apache.xml.serialize.HTMLSerializer;
+import org.apache.xml.serialize.OutputFormat;
+import org.cyberneko.html.HTMLEventInfo;
+import org.cyberneko.html.HTMLScanner;
+import org.cyberneko.html.HTMLTagBalancer;
+import org.w3c.dom.*;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.io.StringWriter;
+import java.util.List;
+import java.util.Set;
+import java.util.Stack;
+
+/**
+ * Creates a greatly simplified DOM model that contains elements for only the specified
+ * element set and creates unescaped text nodes for all other content.
+ * It requires special serialization to prevent escaping of text nodes but behaves like a
+ * regular DOM in all other respects. Only element types which are produced are balanced.
+ */
+public class NekoSimplifiedHtmlParser extends GadgetHtmlParser {
+
+  private static final Set<String> elements =
+      ImmutableSet.of("html", "body", "head", "link", "img", "style", "script", "embed");
+
+  private final DOMImplementation documentFactory;
+
+  @Inject
+  public NekoSimplifiedHtmlParser(DOMImplementation documentFactory) {
+    this.documentFactory = documentFactory;
+  }
+
+  public Document parseDom(String source) {
+
+    HTMLScanner htmlScanner = new HTMLScanner();
+    HTMLTagBalancer tagBalancer = new HTMLTagBalancer();
+    DocumentHandler handler = new DocumentHandler(source);
+    tagBalancer.setDocumentHandler(handler);
+    htmlScanner.setDocumentHandler(tagBalancer);
+    tagBalancer.setFeature("http://cyberneko.org/html/features/augmentations", true);
+    htmlScanner.setFeature("http://cyberneko.org/html/features/augmentations", true);
+
+    XMLInputSource inputSource = new XMLInputSource(null, null, null);
+    inputSource.setEncoding("UTF-8");
+    inputSource.setCharacterStream(new StringReader(source));
+    try {
+      htmlScanner.setInputSource(inputSource);
+      htmlScanner.scanDocument(true);
+      Document document = handler.getDocument();
+      DocumentFragment fragment = handler.getFragment();
+      Node htmlNode = XmlUtil.getFirstNamedChildNode(fragment, "HTML");
+      if (htmlNode != null) {
+        document.appendChild(htmlNode);
+      } else {
+        Node root = document.appendChild(document.createElement("HTML"));
+        root.appendChild(fragment);
+      }
+      HtmlSerializer.attach(document, new Serializer(), source);
+      return document;
+    } catch (IOException ioe) {
+      return null;
+    }
+  }
+
+
+  /**
+   * Handler for XNI events from Neko
+   */
+  private class DocumentHandler implements XMLDocumentHandler {
+    private final List<Integer> lines;
+    private final Stack<Node> elementStack = new Stack<Node>();
+    private final int[] startCharOffsets;
+    private final int[] lastCharOffsets;
+    private DocumentFragment documentFragment;
+    private Document document;
+    private final String content;
+
+    public DocumentHandler(String content) {
+      this.content = content;
+      // Populate lines
+      lines = Lists.newArrayListWithExpectedSize(content.length() / 30);
+      lines.add(0);
+      for (int i = 0; i < content.length(); i++) {
+        char c = content.charAt(i);
+        if (c == '\n' || c == '\r') {
+          if (i + 1 < content.length() && (c == '\r' && content.charAt(i+1) == '\n')) {
+            i++;
+            lines.add(i);
+          } else {
+            lines.add(i);
+          }
+        }
+      }
+      startCharOffsets = new int[]{-1,-1};
+      lastCharOffsets = new int[]{-1,-1};
+    }
+
+    public DocumentFragment getFragment() {
+      return documentFragment;
+    }
+
+    public Document getDocument() {
+      return document;
+    }
+
+    private HTMLEventInfo getEventInfo(Augmentations augmentations) {
+      HTMLEventInfo htmlEventInfo =
+          (HTMLEventInfo) augmentations.getItem("http://cyberneko.org/html/features/augmentations");
+      return htmlEventInfo;
+    }
+
+    private String getUnstructuredString(int[] start, int[] end) {
+      if (start[0] == -1) return "";
+
+      int charStart = start[0];
+      int charEnd;
+      if (end[0] == -1) {
+        charEnd = start[1];
+      } else {
+        charEnd = end[1];
+      }
+      String s = content.substring(charStart, charEnd);
+      return s;
+    }
+
+    private void recordStartEnd(HTMLEventInfo info, int[] offsets) {
+      offsets[0] = lines.get(info.getBeginLineNumber() - 1) + info.getBeginColumnNumber() - 1;
+      offsets[1] = lines.get(info.getEndLineNumber() - 1) + info.getEndColumnNumber() - 1;
+    }
+
+    public void handleEvent(boolean shouldClose, Object content, Augmentations augs) {
+      HTMLEventInfo info = getEventInfo(augs);
+      if (info.isSynthesized()) {
+        // NOTE! Remove this to balance synthesized close tags
+        if (!shouldClose) return;
+        // Must close with existing content
+        String unstructured = getUnstructuredString(startCharOffsets, lastCharOffsets);
+        elementStack.peek().appendChild(document.createTextNode(unstructured));
+        startCharOffsets[0] = -1;
+        lastCharOffsets[0] = -1;
+        if (content != null) {
+          elementStack.peek().appendChild(document.createTextNode(content.toString()));
+        }
+      } else {
+        if (shouldClose) {
+          String unstructured = getUnstructuredString(startCharOffsets, lastCharOffsets);
+          elementStack.peek().appendChild(document.createTextNode(unstructured));
+          startCharOffsets[0] = -1;
+          lastCharOffsets[0] = -1;
+        } else if (startCharOffsets[0] == -1) {
+          recordStartEnd(info, startCharOffsets);
+          lastCharOffsets[0] = -1;
+        } else {
+          recordStartEnd(info, lastCharOffsets);
+        }
+      }
+    }
+
+    private void trace(String prefix, Augmentations augmentations) {
+      HTMLEventInfo info = getEventInfo(augmentations);
+      String text = "";
+      if (!info.isSynthesized()) {
+        int[] startEnd = new int[2];
+        recordStartEnd(info, startEnd);
+        text = content.substring(startEnd[0], startEnd[1]);
+        text = text.replaceAll("\n", "\\n");
+        text = text.replaceAll("\r", "\\r");
+      }
+      System.out.println("Event " + prefix + info.toString() + " -> " + text);
+    }
+
+    public void startDocument(XMLLocator xmlLocator, String encoding,
+                              NamespaceContext namespaceContext, Augmentations augs) throws XNIException {
+      document = documentFactory.createDocument(null, null, null);
+      elementStack.clear();
+      documentFragment = document.createDocumentFragment();
+      elementStack.push(documentFragment);
+      //trace("StartDoc", augs);
+    }
+
+    public void xmlDecl(String version, String encoding, String standalone, Augmentations augs) throws XNIException {
+      //trace("xmlDecl", augs);
+      handleEvent(false, null, augs);
+    }
+
+    public void doctypeDecl(String rootElement, String publicId, String systemId, Augmentations augs) throws XNIException {
+      // Recreate the document with the specific doctype
+      document = documentFactory.createDocument(null, null,
+          documentFactory.createDocumentType(rootElement, publicId, systemId));
+      elementStack.clear();
+      documentFragment = document.createDocumentFragment();
+      elementStack.push(documentFragment);
+      //trace("docTypeDecl", augs);
+      handleEvent(false, null, augs);
+    }
+
+    public void comment(XMLString xmlString, Augmentations augs) throws XNIException {
+      //trace("comment", augs);
+      handleEvent(false, xmlString, augs);
+      //trackInfo(augs);
+    }
+
+    public void processingInstruction(String s, XMLString xmlString, Augmentations augs) throws XNIException {
+      //trace("PI", augs);
+      handleEvent(false, xmlString, augs);
+    }
+
+    public void startElement(QName qName, XMLAttributes xmlAttributes, Augmentations augs) throws XNIException {
+      //trace("StartElem(" + qName.rawname + ")", augs);
+      if (elements.contains(qName.rawname.toLowerCase())) {
+        handleEvent(true, null, augs);
+        Element element = document.createElement(qName.rawname);
+        for (int i = 0; i < xmlAttributes.getLength(); i++) {
+          element.setAttribute(xmlAttributes.getLocalName(i) , xmlAttributes.getValue(i));
+        }
+        elementStack.peek().appendChild(element);
+        elementStack.push(element);
+      } else {
+        handleEvent(false, null, augs);
+      }
+    }
+
+    public void emptyElement(QName qName, XMLAttributes xmlAttributes, Augmentations augs) throws XNIException {
+      //trace("EmptyElemm(" + qName.rawname + ")", augs);
+      if (elements.contains(qName.rawname.toLowerCase())) {
+        handleEvent(true, null, augs);
+        Element element = document.createElement(qName.rawname);
+        for (int i = 0; i < xmlAttributes.getLength(); i++) {
+          element.setAttribute(xmlAttributes.getLocalName(i) , xmlAttributes.getValue(i));
+        }
+        elementStack.peek().appendChild(element);
+      } else {
+        handleEvent(false, null, augs);
+      }
+
+    }
+
+    public void startGeneralEntity(String s, XMLResourceIdentifier xmlResourceIdentifier, String s1, Augmentations augs) throws XNIException {
+      //trace("StartEntity(" + s + ")", augs);
+      handleEvent(false, null, augs);
+    }
+
+    public void textDecl(String s, String s1, Augmentations augs) throws XNIException {
+      //trace("Textdecl(" + s + ")", augs);
+      handleEvent(false, null, augs);
+    }
+
+    public void endGeneralEntity(String s, Augmentations augs) throws XNIException {
+      //trace("EndEntity(" + s + ")", augs);
+      handleEvent(false, null, augs);
+    }
+
+    public void characters(XMLString xmlString, Augmentations augs) throws XNIException {
+      handleEvent(false, xmlString, augs);
+    }
+
+    public void ignorableWhitespace(XMLString xmlString, Augmentations augs) throws XNIException {
+      //trace("Whitespace", augs);
+      handleEvent(false, xmlString, augs);
+      //trackInfo(augs);
+    }
+
+    public void endElement(QName qName, Augmentations augs) throws XNIException {
+      //trace("EndElem(" + qName.rawname + ")", augs);
+      if (elements.contains(qName.rawname.toLowerCase())) {
+        handleEvent(true, null, augs);
+        // FIXME - Balancer
+        elementStack.pop();
+      } else {
+        handleEvent(false, "</" + qName.rawname + ">", augs);
+      }
+    }
+
+    public void startCDATA(Augmentations augs) throws XNIException {
+      //trace("startCData", augs);
+      handleEvent(false, null, augs);
+    }
+
+    public void endCDATA(Augmentations augs) throws XNIException {
+      //trace("endCData", augs);
+      handleEvent(false, null, augs);
+    }
+
+    public void endDocument(Augmentations augs) throws XNIException {
+      //trace("endDoc", augs);
+      handleEvent(false, null, augs);
+    }
+
+    public void setDocumentSource(XMLDocumentSource xmlDocumentSource) {
+    }
+
+    public XMLDocumentSource getDocumentSource() {
+      return null;
+    }
+  }
+
+  static class Serializer extends HtmlSerializer {
+    
+    static final OutputFormat outputFormat = new OutputFormat();
+    static {
+      outputFormat.setPreserveSpace(true);
+      outputFormat.setPreserveEmptyAttributes(false);
+    }
+
+    public String serializeImpl(Document doc) {
+      StringWriter sw = createWriter(doc);
+      HTMLSerializer serializer = new HTMLSerializer(sw, outputFormat) {
+        // Overridden to prevent escaping of literal text
+        @Override
+        protected void characters(String s) throws IOException {
+          this.content();
+          this._printer.printText(s);
+        }
+      };
+      try {
+        serializer.serialize(doc);
+        return sw.toString();
+      } catch (IOException ioe) {
+        return null;
+      }
+    }
+  }
+}

Added: incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/nekohtml/test.html
URL: http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/nekohtml/test.html?rev=710176&view=auto
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/nekohtml/test.html (added)
+++ incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/nekohtml/test.html Mon Nov  3 13:41:23 2008
@@ -0,0 +1,28 @@
+<html><body><select><option>content<option></body></html>
+
+Balanced    
+Event StartDoc1:1:1:1 ->
+Event StartElem(html)1:1:1:7 -> <html>
+Event StartElem(body)1:7:1:13 -> <body>
+Event StartElem(select)1:13:1:21 -> <select>
+Event StartElem(option)1:21:1:29 -> <option>
+Event Chars(content)1:29:1:36 -> content
+Event EndElem(option)synthesized ->
+Event StartElem(option)1:36:1:44 -> <option>
+Event EndElem(option)synthesized ->
+Event EndElem(select)synthesized ->
+Event EndElem(body)synthesized ->
+Event EndElem(html)synthesized ->
+Event endDoc1:58:1:58 ->
+
+Unbalanced
+Event StartDoc1:1:1:1 ->
+Event StartElem(html)1:1:1:7 -> <html>
+Event StartElem(body)1:7:1:13 -> <body>
+Event StartElem(select)1:13:1:21 -> <select>
+Event StartElem(option)1:21:1:29 -> <option>
+Event Chars(content)1:29:1:36 -> content
+Event StartElem(option)1:36:1:44 -> <option>
+Event EndElem(body)1:44:1:51 -> </body>
+Event EndElem(html)1:51:1:58 -> </html>
+Event endDoc1:58:1:58 ->

Modified: incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/rewrite/HtmlContentRewriter.java
URL: http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/rewrite/HtmlContentRewriter.java?rev=710176&r1=710175&r2=710176&view=diff
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/rewrite/HtmlContentRewriter.java (original)
+++ incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/rewrite/HtmlContentRewriter.java Mon Nov  3 13:41:23 2008
@@ -18,14 +18,21 @@
  */
 package org.apache.shindig.gadgets.rewrite;
 
+import com.google.common.collect.Lists;
 import org.apache.shindig.common.uri.Uri;
 import org.apache.shindig.gadgets.Gadget;
 import org.apache.shindig.gadgets.http.HttpRequest;
 import org.apache.shindig.gadgets.http.HttpResponse;
 import org.apache.shindig.gadgets.spec.View;
 import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.traversal.DocumentTraversal;
+import org.w3c.dom.traversal.NodeFilter;
+import org.w3c.dom.traversal.NodeIterator;
 
 import java.net.URI;
+import java.util.List;
+import java.util.Set;
 
 /**
  * Simple helper base class for ContentRewriters that manipulate an
@@ -70,4 +77,38 @@
     return null;
   }
 
+  /**
+   * Collects every element of {@code doc} whose tag name matches one of the
+   * given names, ignoring case, in document order.
+   *
+   * The result is a snapshot list rather than a live {@code NodeList}, so
+   * callers can remove or move the returned nodes while iterating over it.
+   *
+   * @param doc document to search; must also implement
+   *     {@link DocumentTraversal}, otherwise a ClassCastException is thrown
+   * @param lowerCaseNames tag names to match, which must already be lower case
+   * @return the matching elements, possibly empty, never null
+   */
+  public static List<Node> getElementsByTagNameCaseInsensitive(Document doc,
+      final Set<String> lowerCaseNames) {
+    final List<Node> result = Lists.newArrayList();
+    NodeIterator nodeIterator = ((DocumentTraversal) doc)
+        .createNodeIterator(doc, NodeFilter.SHOW_ELEMENT,
+            new NodeFilter() {
+              public short acceptNode(Node n) {
+                // Lower-case with a fixed locale: under e.g. a Turkish default
+                // locale "SCRIPT".toLowerCase() is not "script".
+                String name = n.getNodeName().toLowerCase(java.util.Locale.ENGLISH);
+                if (lowerCaseNames.contains(name)) {
+                  return NodeFilter.FILTER_ACCEPT;
+                }
+                // A NodeIterator treats REJECT like SKIP, so children are still visited
+                return NodeFilter.FILTER_REJECT;
+              }
+            }, false);
+    for (Node n = nodeIterator.nextNode(); n != null ; n = nodeIterator.nextNode()) {
+      result.add(n);
+    }
+    return result;
+  }
 }

Modified: incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/rewrite/JsTagConcatContentRewriter.java
URL: http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/rewrite/JsTagConcatContentRewriter.java?rev=710176&r1=710175&r2=710176&view=diff
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/rewrite/JsTagConcatContentRewriter.java (original)
+++ incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/rewrite/JsTagConcatContentRewriter.java Mon Nov  3 13:41:23 2008
@@ -19,6 +19,7 @@
 package org.apache.shindig.gadgets.rewrite;
 
 import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
 import org.apache.shindig.common.uri.Uri;
 import org.apache.shindig.common.util.Utf8UrlCoder;
 import org.apache.shindig.gadgets.Gadget;
@@ -28,11 +29,11 @@
 import org.apache.shindig.gadgets.spec.GadgetSpec;
 import org.apache.shindig.gadgets.spec.View;
 import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
 
 import java.io.UnsupportedEncodingException;
 import java.net.URLEncoder;
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.LinkedList;
 import java.util.List;
 
@@ -43,6 +44,7 @@
   private final String concatUrlBase;
 
   private static final String DEFAULT_CONCAT_URL_BASE = "/gadgets/concat?";
+  private static final HashSet<String> TAG_NAMES = Sets.newHashSet("script");
 
   public JsTagConcatContentRewriter(ContentRewriterFeature.Factory rewriterFeatureFactory,
       String concatUrlBase) {
@@ -68,14 +70,9 @@
     }
 
     // Get all the script tags
-    NodeList scriptTags = content.getDocument().getElementsByTagName("SCRIPT");
+    List<Node> nodeList =
+        HtmlContentRewriter.getElementsByTagNameCaseInsensitive(content.getDocument(), TAG_NAMES);
 
-    // Copy NodeList as it respects changes to the underlying document which is a
-    // behavior we dont want when removing nodes
-    List<Node> nodeList = Lists.newArrayListWithExpectedSize(scriptTags.getLength());
-    for (int i = 0; i < scriptTags.getLength(); i++) {
-      nodeList.add(scriptTags.item(i));
-    }
 
     String concatBase = getJsConcatBase(gadget.getSpec(), rewriterFeature);
     Uri contentBase = gadget.getSpec().getUrl();

Modified: incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/rewrite/LinkingTagContentRewriter.java
URL: http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/rewrite/LinkingTagContentRewriter.java?rev=710176&r1=710175&r2=710176&view=diff
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/rewrite/LinkingTagContentRewriter.java (original)
+++ incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/rewrite/LinkingTagContentRewriter.java Mon Nov  3 13:41:23 2008
@@ -55,7 +55,7 @@
           .createNodeIterator(root, NodeFilter.SHOW_ELEMENT,
               new NodeFilter() {
                 public short acceptNode(Node n) {
-                  Set<String> stringSet = tagAttributeTargets.get(n.getNodeName());
+                  Set<String> stringSet = tagAttributeTargets.get(n.getNodeName().toUpperCase());
                   if (stringSet != null) {
                     NamedNodeMap attributes = n.getAttributes();
                     // TODO - Check is NodeMap lookup is case insensitive, if so use that

Modified: incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/rewrite/MutableContent.java
URL: http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/rewrite/MutableContent.java?rev=710176&r1=710175&r2=710176&view=diff
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/rewrite/MutableContent.java (original)
+++ incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/rewrite/MutableContent.java Mon Nov  3 13:41:23 2008
@@ -19,13 +19,9 @@
 
 import org.apache.shindig.gadgets.GadgetException;
 import org.apache.shindig.gadgets.parse.GadgetHtmlParser;
-import org.apache.xml.serialize.HTMLSerializer;
-import org.apache.xml.serialize.OutputFormat;
+import org.apache.shindig.gadgets.parse.HtmlSerializer;
 import org.w3c.dom.Document;
 
-import java.io.IOException;
-import java.io.StringWriter;
-
 /**
  * Object that maintains a String representation of arbitrary contents
  * and a consistent view of those contents as an HTML parse tree.
@@ -70,14 +66,9 @@
       // per rendering cycle: all rewriters (or other manipulators)
       // operating on the parse tree should happen together.
       contentParseId = parseEditId;
-      StringWriter sw = new StringWriter((content.length() * 10) / 9);
 
-      try {
-        new HTMLSerializer(sw, new OutputFormat(document)).serialize(document);
-      } catch (IOException e) {
-        // Never happens.
-      }
-      content = sw.toString();
+      // Parser will have bound an HTML serializer to the document
+      content = HtmlSerializer.serialize(document);
     }
     return content;
   }

Modified: incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/rewrite/StyleLinksContentRewriter.java
URL: http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/rewrite/StyleLinksContentRewriter.java?rev=710176&r1=710175&r2=710176&view=diff
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/rewrite/StyleLinksContentRewriter.java (original)
+++ incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/rewrite/StyleLinksContentRewriter.java Mon Nov  3 13:41:23 2008
@@ -18,16 +18,18 @@
  */
 package org.apache.shindig.gadgets.rewrite;
 
+import com.google.common.collect.Sets;
 import org.apache.shindig.common.uri.Uri;
+import org.apache.shindig.common.xml.XmlUtil;
 import org.apache.shindig.gadgets.Gadget;
 import org.apache.shindig.gadgets.http.HttpRequest;
 import org.apache.shindig.gadgets.http.HttpResponse;
 import org.apache.shindig.gadgets.spec.View;
 import org.w3c.dom.Document;
 import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
 
 import java.net.URI;
+import java.util.List;
 
 public class StyleLinksContentRewriter implements ContentRewriter {
   // TODO: consider providing helper base class for node-visitor content rewriters
@@ -74,20 +76,18 @@
     }
     boolean mutated = false;
 
-    Node head;
-    NodeList headTags = doc.getElementsByTagName("HEAD");
-    if (headTags.getLength() == 0) {
+    // TODO This should move into parsers
+    Node head = XmlUtil.getFirstNamedChildNode(doc.getDocumentElement(), "head");
+    if (head == null) {
       mutated = true;
-      head = doc.getDocumentElement().appendChild(doc.createElement("HEAD"));
-    } else {
-      head = headTags.item(0);
+      head = doc.getDocumentElement().appendChild(doc.createElement("head"));
     }
 
     // Move all style tags into head
     // TODO Convert all @imports into a concatenated link tag
-    NodeList styleTags = doc.getElementsByTagName("STYLE");
-    for (int i = 0; i < styleTags.getLength(); i++) {
-      Node styleNode = styleTags.item(i);
+    List<Node> styleTags = HtmlContentRewriter.getElementsByTagNameCaseInsensitive(doc,
+        Sets.newHashSet("style"));
+    for (Node styleNode : styleTags) {      
       mutated = true;
       if (!styleNode.getParentNode().getNodeName().equalsIgnoreCase("HEAD")) {
         styleNode.getParentNode().removeChild(styleNode);

Modified: incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/parse/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/parse/HtmlParserTest.java?rev=710176&r1=710175&r2=710176&view=diff
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/parse/HtmlParserTest.java (original)
+++ incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/parse/HtmlParserTest.java Mon Nov  3 13:41:23 2008
@@ -31,10 +31,10 @@
 public class HtmlParserTest extends TestCase {
 
   private final GadgetHtmlParser cajaParser = new CajaHtmlParser(
-      new ParseModule.HTMLDocumentProvider());
+      new ParseModule.DOMImplementationProvider().get());
 
   private final GadgetHtmlParser nekoParser = new NekoHtmlParser(
-      new ParseModule.HTMLDocumentProvider());
+      new ParseModule.DOMImplementationProvider().get());
 
   public void testParseSimpleString() throws Exception {
     parseSimpleString(cajaParser);
@@ -57,7 +57,7 @@
     parseTagWithStringContents(cajaParser);
   }
 
-  public void parseTagWithStringContents(GadgetHtmlParser htmlParser) throws Exception {
+  void parseTagWithStringContents(GadgetHtmlParser htmlParser) throws Exception {
     Document doc = htmlParser.parseDom("<span>content</span>");
 
     Node node = doc.getDocumentElement().getFirstChild();

Modified: incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/parse/ParseTreeSerializerBenchmark.java
URL: http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/parse/ParseTreeSerializerBenchmark.java?rev=710176&r1=710175&r2=710176&view=diff
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/parse/ParseTreeSerializerBenchmark.java (original)
+++ incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/parse/ParseTreeSerializerBenchmark.java Mon Nov  3 13:41:23 2008
@@ -21,28 +21,40 @@
 import org.apache.shindig.gadgets.GadgetException;
 import org.apache.shindig.gadgets.parse.caja.CajaHtmlParser;
 import org.apache.shindig.gadgets.parse.nekohtml.NekoHtmlParser;
-import org.apache.xml.serialize.HTMLSerializer;
-import org.cyberneko.html.parsers.SAXParser;
-import org.w3c.dom.Node;
+import org.apache.shindig.gadgets.parse.nekohtml.NekoSimplifiedHtmlParser;
+import org.w3c.dom.DOMImplementation;
 import org.w3c.dom.bootstrap.DOMImplementationRegistry;
-import org.w3c.dom.ls.*;
 
-import java.io.*;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.StringWriter;
 
 /**
  * Benchmarks for HTML parsing and serialization
- *
- * NOTE - Uncomment DOM4J bits to test that.
  */
 public class ParseTreeSerializerBenchmark {
   private DOMImplementationRegistry registry = DOMImplementationRegistry.newInstance();
   private int numRuns;
   private String content;
-  private GadgetHtmlParser cajaParser = new CajaHtmlParser(new ParseModule.HTMLDocumentProvider());
-  private GadgetHtmlParser nekoParser = new NekoHtmlParser(new ParseModule.HTMLDocumentProvider());
+
+  private GadgetHtmlParser cajaParser = new CajaHtmlParser(
+      DOCUMENT_PROVIDER);
+
+  private GadgetHtmlParser nekoParser = new NekoHtmlParser(
+      DOCUMENT_PROVIDER);
+
+  private GadgetHtmlParser nekoSimpleParser = new NekoSimplifiedHtmlParser(
+      DOCUMENT_PROVIDER);
+  
   private boolean warmup;
-  private SAXParser saxParser;
-  //private SAXReader saxReader;
+
+  private static final DOMImplementation DOCUMENT_PROVIDER =
+      new ParseModule.DOMImplementationProvider().get();
 
   private ParseTreeSerializerBenchmark(String file, int numRuns) throws Exception {
     File inputFile = new File(file);
@@ -52,23 +64,18 @@
     }
     content = new String(IOUtils.toByteArray(new FileInputStream(file)));
 
-    saxParser = new SAXParser();
-    //saxParser.setFeature("http://cyberneko.org/html/features/scanner/script/strip-comment-delims",true);
-    saxParser.setFeature("http://cyberneko.org/html/features/scanner/notify-builtin-refs",true);
-    //saxReader = new SAXReader(saxParser);
-    //saxReader.setValidation(false);
-
-    this.numRuns = 50;
+    this.numRuns = 10;
     warmup = true;
-    runCaja();
+    //runCaja();
     runNeko();
-    runLS();
+    runNekoSimple();
+    //Sleep to let JIT kick in
     Thread.sleep(10000L);
-    this.numRuns = 300; //numRuns;
+    this.numRuns = 50; //numRuns;
     warmup = false;
-    runCaja();
+    //runCaja();
     runNeko();
-    runLS();
+    runNekoSimple();
   }
 
   private void runCaja() throws Exception {
@@ -76,19 +83,19 @@
     // Some warmup runs with wait. Enough iterations to trigger the JIT
     // Wait to allow it to swap execution paths etc...
     timeParseDom(cajaParser);
+    timeParseDomSerialize(cajaParser);
   }
 
   private void runNeko() throws Exception {
     output("Neko-----------------");
     timeParseDom(nekoParser);
-    //timeParseDom4J();
-    //timeParseDom4JSerialize();
     timeParseDomSerialize(nekoParser);
   }
 
-  private void runLS() throws Exception {
-    output("LOAD/STORE-----------------");
-    runLSSerializationTiming(nekoParser);
+  private void runNekoSimple() throws Exception {
+    output("NekoSimple-----------------");
+    timeParseDom(nekoSimpleParser);
+    timeParseDomSerialize(nekoSimpleParser);
   }
 
   private void output(String string) {
@@ -108,119 +115,42 @@
           ((double)parseMillis)/numRuns + "ms/run]");
   }
 
-  /*
-  private void timeParseDom4J() throws GadgetException {
+  private void timeParseDomSerialize(GadgetHtmlParser parser) throws GadgetException {
+    org.w3c.dom.Document document = parser.parseDom(content);
     try {
       long parseStart = System.currentTimeMillis();
       for (int i = 0; i < numRuns; ++i) {
-         saxReader.read(new InputSource(new StringReader(content)));
+        HtmlSerializer.serialize(document);
       }
       long parseMillis = System.currentTimeMillis() - parseStart;
 
-      output("Parsing DOM4J [" + parseMillis + " ms total: " +
-            ((double)parseMillis)/numRuns + "ms/run]");
+      output("Serializing [" + parseMillis + " ms total: " +
+            ((double) parseMillis) / numRuns + "ms/run]");
     } catch (Exception e) {
       throw new GadgetException(GadgetException.Code.HTML_PARSE_ERROR, e);
     }
-  }
-  */
 
-  /*
-  private void timeParseDom4JSerialize() throws GadgetException {
     try {
-      Document document =  saxReader.read(new InputSource(new StringReader(content)));
-      OutputFormat format = OutputFormat.createCompactFormat();
-      format.setXHTML(false);
+      // Create an "identity" transformer - copies input to output
+      Transformer t = TransformerFactory.newInstance().newTransformer();
+      t.setOutputProperty(OutputKeys.METHOD, "html");
 
       long parseStart = System.currentTimeMillis();
       for (int i = 0; i < numRuns; ++i) {
         StringWriter sw = new StringWriter((content.length() * 11) / 10);
-        HTMLWriter htmlWriter = new HTMLWriter(sw, format) {
-          protected void writeEntity(Entity entity) throws IOException {
-            writer.write("&");
-            writer.write(entity.getName());
-            writer.write(";");
-            lastOutputNodeType = org.dom4j.Node.ENTITY_REFERENCE_NODE;
-          }
-        };
-        //htmlWriter.setResolveEntityRefs(false);
-        htmlWriter.setEscapeText(false);
-        htmlWriter.write(document);
+        t.transform(new DOMSource(document), new StreamResult(sw));
+        sw.toString();
       }
       long parseMillis = System.currentTimeMillis() - parseStart;
 
-      output("Serializing DOM4J [" + parseMillis + " ms total: " +
-            ((double)parseMillis)/numRuns + "ms/run]");
-    } catch (Exception e) {
-      throw new GadgetException(GadgetException.Code.HTML_PARSE_ERROR, e);
-    }
-
-  }
-  */
-
-  private void timeParseDomSerialize(GadgetHtmlParser parser) throws GadgetException {
-    org.w3c.dom.Document document = parser.parseDom(content);
-
-    try {
-      long parseStart = System.currentTimeMillis();
-      for (int i = 0; i < numRuns; ++i) {
-        StringWriter sw = new StringWriter((content.length() * 11) / 10);
-        HTMLSerializer xercesSerializer = new HTMLSerializer(sw, new org.apache.xml.serialize.OutputFormat());
-        xercesSerializer.serialize(document);
-      }
-      long parseMillis = System.currentTimeMillis() - parseStart;
-
-      output("Serializing Xerces [" + parseMillis + " ms total: " +
+      output("Serializing DOM Transformer [" + parseMillis + " ms total: " +
             ((double) parseMillis) / numRuns + "ms/run]");
+      
     } catch (Exception e) {
       throw new GadgetException(GadgetException.Code.HTML_PARSE_ERROR, e);
     }
   }
 
-  /*
-  private void timeParseOld(GadgetHtmlParser parser) throws GadgetException {
-    long parseStart = System.currentTimeMillis();
-    for (int i = 0; i < numRuns; ++i) {
-      parser.parse(content);
-    }
-    long parseMillis = System.currentTimeMillis() - parseStart;
-
-    output("Parsing [" + parseMillis + " ms total: " +
-          ((double)parseMillis)/numRuns + "ms/run]");
-  }
-  */
-
-  private void runLSSerializationTiming(GadgetHtmlParser parser) throws Exception {
-    Node n = parser.parseDom(content);
-    DOMImplementationLS impl = (DOMImplementationLS) registry.getDOMImplementation("LS");
-    ByteArrayOutputStream baos;
-    baos = new ByteArrayOutputStream(content.length() * 2);
-    LSSerializer writer = impl.createLSSerializer();
-    LSParser lsParser = impl.createLSParser(LSParser.ACTION_APPEND_AS_CHILDREN, null);
-
-    long serTime = 0, deserTime = 0;
-    for (int i = 0; i < numRuns; ++i) {
-      long serStart = System.currentTimeMillis();
-      LSOutput output = impl.createLSOutput();
-      baos.reset();
-      output.setByteStream(baos);
-      writer.write(n, output);
-      serTime += (System.currentTimeMillis() - serStart);
-      LSInput input = impl.createLSInput();
-      input.setByteStream(new ByteArrayInputStream(baos.toByteArray()));
-      long deserStart = System.currentTimeMillis();
-      //XmlUtil.parse(new String(baos.toByteArray()));
-      lsParser.parse(input);
-      deserTime += (System.currentTimeMillis() - deserStart);
-      //checkListEquality(nodes, outs);
-    }
-
-    output("LS Serialization [" + serTime + " ms total: "
-          + ((double)serTime)/numRuns + "ms/run]");
-    output("LS Deserialization [" + deserTime + " ms total: "
-          + ((double)deserTime)/numRuns + "ms/run]");
-  }
-
   public static void main(String[] args) {
     // Test can be run as standalone program to test out serialization and parsing
     // performance numbers, using Caja as a parser.

Added: incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/parse/nekohtml/NekoSimplifiedHtmlParserTest.java
URL: http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/parse/nekohtml/NekoSimplifiedHtmlParserTest.java?rev=710176&view=auto
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/parse/nekohtml/NekoSimplifiedHtmlParserTest.java (added)
+++ incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/parse/nekohtml/NekoSimplifiedHtmlParserTest.java Mon Nov  3 13:41:23 2008
@@ -0,0 +1,88 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ */
+package org.apache.shindig.gadgets.parse.nekohtml;
+
+import junit.framework.TestCase;
+import org.apache.shindig.gadgets.parse.ParseModule;
+import org.apache.xml.serialize.HTMLSerializer;
+import org.apache.xml.serialize.OutputFormat;
+import org.w3c.dom.Document;
+
+import java.io.IOException;
+import java.io.StringWriter;
+import java.util.Locale;
+
+/**
+ * Test behavior of simplified HTML parser
+ */
+public class NekoSimplifiedHtmlParserTest extends TestCase {
+
+  public void testUnbalanced() throws Exception {
+    parseAndCompareBalanced("<html><body><center>content</body></html>",
+        "<html><body><center>content</body></html>");
+  }
+
+  public void testUnbalanced2() throws Exception {
+    parseAndCompareBalanced("<html><body><img>content<img>content</body></html>",
+        "<HTML><body><IMG>content<IMG>content</body></HTML>");
+  }
+
+  public void testUnbalanced3() throws Exception {
+    parseAndCompareBalanced("<html><body><select><option>content<option></body></html>",
+        "<html><body><select><option>content<option></body></html>");
+  }
+
+  public void testUnbalanced4() throws Exception {
+    parseAndCompareBalanced("<html><body>Something awful</html>",
+        "<HTML><body>Something awful</body></HTML>");
+  }
+
+  public void testUnbalanced5() throws Exception {
+    parseAndCompareBalanced("<html><body><br />content<br></html>",
+        "<HTML><body><br />content<br></body></HTML>");
+  }
+
+  /**
+   * Parses {@code content} with the simplified Neko parser, serializes the
+   * resulting DOM back to HTML and asserts that it equals {@code expected},
+   * ignoring case.
+   */
+  private void parseAndCompareBalanced(String content, String expected) throws Exception {
+    NekoSimplifiedHtmlParser parser = new NekoSimplifiedHtmlParser(
+        new ParseModule.DOMImplementationProvider().get());
+    Document document = parser.parseDom(content);
+    StringWriter sw = new StringWriter();
+    OutputFormat outputFormat = new OutputFormat();
+    outputFormat.setPreserveSpace(true);
+    outputFormat.setOmitDocumentType(true);
+    HTMLSerializer serializer = new HTMLSerializer(sw, outputFormat) {
+      // Send text straight to the printer rather than through the
+      // serializer's default character handling.
+      @Override
+      protected void characters(String s) throws IOException {
+        this.content();
+        this._printer.printText(s);
+      }
+    };
+    serializer.serialize(document);
+
+    // Compare with a fixed locale so the result does not depend on the JVM default
+    assertEquals(expected.toLowerCase(Locale.ENGLISH),
+        sw.toString().toLowerCase(Locale.ENGLISH));
+  }
+}

Modified: incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/rewrite/JsTagConcatContentRewriterTest.java
URL: http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/rewrite/JsTagConcatContentRewriterTest.java?rev=710176&r1=710175&r2=710176&view=diff
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/rewrite/JsTagConcatContentRewriterTest.java (original)
+++ incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/rewrite/JsTagConcatContentRewriterTest.java Mon Nov  3 13:41:23 2008
@@ -49,9 +49,9 @@
   }
 
   public void testJSMergePreserveNoExternal() throws Exception {
-    String s = "<SCRIPT>\n"
+    String s = "<script>\n"
         + "doSomething\n"
-        + "</SCRIPT>";
+        + "</script>";
 
     Document document = htmlParser.parseDom(s);
     String rewritten = rewriteHelper(rewriter, s, document);
@@ -67,81 +67,81 @@
   }
 
   public void testJSMergePreserveWithComment() throws Exception {
-    String s = "<SCRIPT>" +
+    String s = "<script>" +
         "<!--\ndoSomething\n-->" +
-        "</SCRIPT>";
+        "</script>";
     Document document = htmlParser.parseDom(s);
     String rewritten = rewriteHelper(rewriter, s, document);
     assertEquals(rewritten, s);
   }
 
   public void testJSMergeSingleScriptReWrite() throws Exception {
-    String s = "<SCRIPT src=\"http://a.b.com/1.js\"></SCRIPT>";
-    String expected = "<SCRIPT src=\"" + concatBase + "1=http%3A%2F%2Fa.b.com%2F1.js\"></SCRIPT>";
+    String s = "<script src=\"http://a.b.com/1.js\"></script>";
+    String expected = "<script src=\"" + concatBase + "1=http%3A%2F%2Fa.b.com%2F1.js\"></script>";
     Document document = htmlParser.parseDom(s);
     String rewritten = rewriteHelper(rewriter, s, document);
     assertEquals(rewritten, expected);
   }
 
   public void testJSMergeTwoScriptReWriteWithWhitespace() throws Exception {
-    String s = "<SCRIPT src=\"http://a.b.com/1.js\"></SCRIPT>"
-        + "<SCRIPT src=\"http://a.b.com/2.js\"></SCRIPT>";
+    String s = "<script src=\"http://a.b.com/1.js\"></script>"
+        + "<script src=\"http://a.b.com/2.js\"></script>";
     String expected
-        = "<SCRIPT src=\"" + concatBase + "1=http%3A%2F%2Fa.b.com%2F1.js&2=http%3A%2F%2Fa.b.com%2F2.js\"></SCRIPT>";
+        = "<script src=\"" + concatBase + "1=http%3A%2F%2Fa.b.com%2F1.js&2=http%3A%2F%2Fa.b.com%2F2.js\"></script>";
     Document document = htmlParser.parseDom(s);
     String rewritten = rewriteHelper(rewriter, s, document);
     assertEquals(rewritten, expected);
   }
 
   public void testJSMergeLeadAndTrailingScriptReWrite() throws Exception {
-    String s = "<SCRIPT>\n"
+    String s = "<script>\n"
         + "doSomething\n"
-        + "</SCRIPT>"
-        + "<SCRIPT src=\"http://a.b.com/1.js\"></SCRIPT>"
-        + "<SCRIPT src=\"http://a.b.com/2.js\"></SCRIPT>"
-        + "<SCRIPT>\n"
+        + "</script>"
+        + "<script src=\"http://a.b.com/1.js\"></script>"
+        + "<script src=\"http://a.b.com/2.js\"></script>"
+        + "<script>\n"
         + "doSomething\n"
-        + "</SCRIPT>";
-    String expected = "<SCRIPT>\n"
+        + "</script>";
+    String expected = "<script>\n"
         + "doSomething\n"
-        + "</SCRIPT>"
-        + "<SCRIPT src=\"" + concatBase + "1=http%3A%2F%2Fa.b.com%2F1.js&2=http%3A%2F%2Fa.b.com%2F2.js\"></SCRIPT>"
-        + "<SCRIPT>\n"
+        + "</script>"
+        + "<script src=\"" + concatBase + "1=http%3A%2F%2Fa.b.com%2F1.js&2=http%3A%2F%2Fa.b.com%2F2.js\"></script>"
+        + "<script>\n"
         + "doSomething\n"
-        + "</SCRIPT>";
+        + "</script>";
     Document document = htmlParser.parseDom(s);
     String rewritten = rewriteHelper(rewriter, s, document);
     assertEquals(rewritten, expected);
   }
 
   public void testJSMergeInterspersed() throws Exception {
-    String s = "<SCRIPT src=\"http://a.b.com/1.js\"></SCRIPT>"
-        + "<SCRIPT src=\"http://a.b.com/2.js\"></SCRIPT>"
-        + "<SCRIPT><!-- doSomething --></SCRIPT>"
-        + "<SCRIPT src=\"http://a.b.com/3.js\"></SCRIPT>"
-        + "<SCRIPT src=\"http://a.b.com/4.js\"></SCRIPT>";
+    String s = "<script src=\"http://a.b.com/1.js\"></script>"
+        + "<script src=\"http://a.b.com/2.js\"></script>"
+        + "<script><!-- doSomething --></script>"
+        + "<script src=\"http://a.b.com/3.js\"></script>"
+        + "<script src=\"http://a.b.com/4.js\"></script>";
     String expected =
-        "<SCRIPT src=\"" + concatBase + "1=http%3A%2F%2Fa.b.com%2F1.js&2=http%3A%2F%2Fa.b.com%2F2.js\"></SCRIPT>" +
-        "<SCRIPT><!-- doSomething --></SCRIPT>" +
-        "<SCRIPT src=\"" + concatBase + "1=http%3A%2F%2Fa.b.com%2F3.js&2=http%3A%2F%2Fa.b.com%2F4.js\"></SCRIPT>";
+        "<script src=\"" + concatBase + "1=http%3A%2F%2Fa.b.com%2F1.js&2=http%3A%2F%2Fa.b.com%2F2.js\"></script>" +
+        "<script><!-- doSomething --></script>" +
+        "<script src=\"" + concatBase + "1=http%3A%2F%2Fa.b.com%2F3.js&2=http%3A%2F%2Fa.b.com%2F4.js\"></script>";
     Document document = htmlParser.parseDom(s);
     String rewritten = rewriteHelper(rewriter, s, document);
     assertEquals(expected, rewritten);
   }
 
   public void testJSMergeDerelativizeHostRelative() throws Exception {
-    String s = "<SCRIPT src=\"/1.js\"></SCRIPT>";
+    String s = "<script src=\"/1.js\"></script>";
     String expected
-        = "<SCRIPT src=\"" + concatBase + "1=http%3A%2F%2Fgadget.org%2F1.js\"></SCRIPT>";
+        = "<script src=\"" + concatBase + "1=http%3A%2F%2Fgadget.org%2F1.js\"></script>";
     Document document = htmlParser.parseDom(s);
     String rewritten = rewriteHelper(rewriter, s, document);
     assertEquals(rewritten, expected);
   }
 
   public void testJSMergeDerelativizePathRelative() throws Exception {
-    String s = "<SCRIPT src=\"1.js\"></SCRIPT>";
+    String s = "<script src=\"1.js\"></script>";
     String expected
-        = "<SCRIPT src=\"" + concatBase + "1=http%3A%2F%2Fgadget.org%2Fdir%2F1.js\"></SCRIPT>";
+        = "<script src=\"" + concatBase + "1=http%3A%2F%2Fgadget.org%2Fdir%2F1.js\"></script>";
     Document document = htmlParser.parseDom(s);
     String rewritten = rewriteHelper(rewriter, s, document);
     assertEquals(rewritten, expected);

Added: incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/rewrite/LexerVsDomRewriteBenchmark.java
URL: http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/rewrite/LexerVsDomRewriteBenchmark.java?rev=710176&view=auto
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/rewrite/LexerVsDomRewriteBenchmark.java (added)
+++ incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/rewrite/LexerVsDomRewriteBenchmark.java Mon Nov  3 13:41:23 2008
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.shindig.gadgets.rewrite;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.shindig.gadgets.parse.GadgetHtmlParser;
+import org.apache.shindig.gadgets.parse.HtmlSerializer;
+import org.apache.shindig.gadgets.parse.ParseModule;
+import org.apache.shindig.gadgets.parse.caja.CajaHtmlParser;
+import org.apache.shindig.gadgets.parse.nekohtml.NekoHtmlParser;
+import org.apache.shindig.gadgets.parse.nekohtml.NekoSimplifiedHtmlParser;
+import org.apache.shindig.gadgets.rewrite.lexer.HtmlRewriter;
+import org.apache.shindig.gadgets.rewrite.lexer.HtmlTagTransformer;
+import org.apache.shindig.gadgets.rewrite.lexer.LinkingTagRewriter;
+import org.w3c.dom.Document;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.net.URI;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Standalone benchmark comparing the lexer based link rewriter with the DOM
+ * based link rewriter, the latter driven by each of the enabled HTML parsers.
+ *
+ * <p>Every strategy is executed once as an unreported warm-up pass and then a
+ * second time with its timings printed to standard output.
+ */
+public class LexerVsDomRewriteBenchmark {
+
+  /** Number of rewrite iterations performed per pass. */
+  private final int numRuns;
+
+  /** Contents of the input file; rewritten from scratch on every iteration. */
+  private final String content;
+
+  // Kept available but currently excluded from the benchmark; see the
+  // commented-out run(cajaParser) calls in the constructor.
+  private final GadgetHtmlParser cajaParser = new CajaHtmlParser(
+      new ParseModule.DOMImplementationProvider().get());
+
+  private final GadgetHtmlParser nekoParser = new NekoHtmlParser(
+      new ParseModule.DOMImplementationProvider().get());
+
+  private final GadgetHtmlParser nekoSimpleParser = new NekoSimplifiedHtmlParser(
+      new ParseModule.DOMImplementationProvider().get());
+
+  // Caja lexer
+  private final Map<String, HtmlTagTransformer> defaultTransformerMap;
+  private final URI dummyUri;
+
+  private final LinkingTagContentRewriter domRewriter;
+
+  /** While true, {@link #output(String)} discards the timing reports. */
+  private boolean warmup;
+
+  /**
+   * Prepares both rewriters, loads the input file and immediately executes the
+   * benchmark: one silent warm-up pass followed by one reported pass.
+   *
+   * @param file path of the file whose content is rewritten
+   * @param numRuns number of iterations per pass
+   * @throws Exception if setup, reading the input file or any pass fails
+   */
+  private LexerVsDomRewriteBenchmark(String file, int numRuns) throws Exception {
+    File inputFile = new File(file);
+    if (!inputFile.exists() || !inputFile.canRead()) {
+      System.err.println("Input file: " + file + " not found or can't be read.");
+      System.exit(1);
+    }
+
+    // Identity rewriter: every link is returned unchanged.
+    LinkRewriter linkRewriter = new LinkRewriter() {
+      public String rewrite(String link, URI context) {
+        return link;
+      }
+    };
+
+    // Lexer setup
+    dummyUri = new URI("http://www.w3c.org");
+    URI relativeBase = new URI("http://a.b.com/");
+    LinkingTagRewriter lexerRewriter = new LinkingTagRewriter(linkRewriter, relativeBase);
+    defaultTransformerMap = new HashMap<String, HtmlTagTransformer>();
+    for (String tag : lexerRewriter.getSupportedTags()) {
+      defaultTransformerMap.put(tag, lexerRewriter);
+    }
+    // End lexer setup
+
+    // DOM setup
+    domRewriter = new LinkingTagContentRewriter(linkRewriter, null);
+    // End DOM setup
+
+    content = readContent(inputFile);
+    this.numRuns = numRuns;
+
+    // Warm-up pass: same work as the measured pass, but nothing is printed.
+    warmup = true;
+    runLexer();
+    //run(cajaParser);
+    run(nekoParser);
+    run(nekoSimpleParser);
+
+    // Pause between the warm-up and the measured pass.
+    // NOTE(review): presumably lets background JIT/GC activity settle - confirm.
+    Thread.sleep(5000L);
+
+    // Measured pass
+    warmup = false;
+    System.out.println("Lexer------");
+    runLexer();
+    //System.out.println("Caja-------");
+    //run(cajaParser);
+    System.out.println("Neko-------");
+    run(nekoParser);
+    System.out.println("NekoSimple-------");
+    run(nekoSimpleParser);
+  }
+
+  /**
+   * Reads the whole file into a string. The bytes are decoded as UTF-8 rather
+   * than with the platform default charset so that the benchmark input is the
+   * same on every machine, and the stream is closed even when reading fails.
+   *
+   * @param inputFile file to load
+   * @return the decoded file content
+   * @throws Exception if the file cannot be opened or read
+   */
+  private static String readContent(File inputFile) throws Exception {
+    FileInputStream in = new FileInputStream(inputFile);
+    try {
+      return IOUtils.toString(in, "UTF-8");
+    } finally {
+      IOUtils.closeQuietly(in);
+    }
+  }
+
+  /**
+   * Prints a report line unless the benchmark is still warming up.
+   *
+   * @param content text to print
+   */
+  private void output(String content) {
+    if (!warmup) {
+      System.out.println(content);
+    }
+  }
+
+  /**
+   * Reports the total and per-run wall-clock time elapsed since startTime.
+   *
+   * @param label name of the strategy that was timed
+   * @param startTime value of System.currentTimeMillis() when timing started
+   */
+  private void report(String label, long startTime) {
+    long time = System.currentTimeMillis() - startTime;
+    output(label + " [" + time + " ms total: " +
+        ((double) time) / numRuns + "ms/run]");
+  }
+
+  /**
+   * Times numRuns lexer based rewrites of the input content.
+   *
+   * @throws Exception if a rewrite fails
+   */
+  private void runLexer() throws Exception {
+    long startTime = System.currentTimeMillis();
+    for (int i = 0; i < numRuns; i++) {
+      HtmlRewriter.rewrite(content, dummyUri, defaultTransformerMap);
+    }
+    report("Lexer Rewrite", startTime);
+  }
+
+  /**
+   * Times numRuns parse / DOM rewrite / serialize cycles of the input content.
+   *
+   * @param parser parser used to build the DOM on every iteration
+   * @throws Exception if parsing, rewriting or serializing fails
+   */
+  private void run(GadgetHtmlParser parser) throws Exception {
+    long startTime = System.currentTimeMillis();
+    for (int i = 0; i < numRuns; i++) {
+      Document document = parser.parseDom(content);
+      domRewriter.rewrite(document, dummyUri);
+      HtmlSerializer.serialize(document);
+    }
+    report("DOM Rewrite", startTime);
+  }
+
+  /**
+   * Command line entry point.
+   *
+   * @param args the input file path followed by the number of runs per pass
+   */
+  public static void main(String[] args) {
+    // Can be run as a standalone program to compare the performance of lexer
+    // based rewriting with parse + DOM rewrite + serialize using the Neko and
+    // simplified Neko parsers (the Caja parser run is currently disabled).
+    if (args.length != 2) {
+      System.err.println("Args: <input-file> <num-runs>");
+      System.exit(1);
+    }
+
+    String fileArg = args[0];
+    String runsArg = args[1];
+    int numRuns = -1;
+    try {
+      numRuns = Integer.parseInt(runsArg);
+    } catch (NumberFormatException e) {
+      System.err.println("Invalid num-runs argument: " + runsArg + ", reason: " + e);
+      // Do not fall through with the -1 placeholder: no iteration would run
+      // and the reported timings would be meaningless.
+      System.exit(1);
+    }
+    if (numRuns <= 0) {
+      System.err.println("Invalid num-runs argument: " + runsArg
+          + ", reason: must be a positive integer");
+      System.exit(1);
+    }
+    try {
+      new LexerVsDomRewriteBenchmark(fileArg, numRuns);
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+
+}

Modified: incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/rewrite/LinkingTagContentRewriterTest.java
URL: http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/rewrite/LinkingTagContentRewriterTest.java?rev=710176&r1=710175&r2=710176&view=diff
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/rewrite/LinkingTagContentRewriterTest.java (original)
+++ incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/rewrite/LinkingTagContentRewriterTest.java Mon Nov  3 13:41:23 2008
@@ -51,10 +51,10 @@
         + "<IMG src=\"http://a.b.com/img2.gif\"/>"
         + "<eMbeD src=\"http://a.b.com/some.mov\"/>"
         + "<link href=\"http://a.b.com/link.html\"></link>";
-    String expected = "<IMG src=\"" + LINK_PREFIX + "http://a.b.com/img.gif\">"
-        + "<IMG src=\"" + LINK_PREFIX + "http://a.b.com/img2.gif\">"
-        + "<EMBED src=\"" + LINK_PREFIX + "http://a.b.com/some.mov\"></EMBED>"
-        + "<LINK href=\"" + LINK_PREFIX + "http://a.b.com/link.html\">";
+    String expected = "<img src=\"" + LINK_PREFIX + "http://a.b.com/img.gif\">"
+        + "<img src=\"" + LINK_PREFIX + "http://a.b.com/img2.gif\">"
+        + "<embed src=\"" + LINK_PREFIX + "http://a.b.com/some.mov\"></embed>"
+        + "<link href=\"" + LINK_PREFIX + "http://a.b.com/link.html\">";
     Document document = htmlParser.parseDom(s);
     String rewritten = rewriteHelper(rewriter, s, document);
     assertEquals(rewritten, expected);

Modified: incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/rewrite/StyleLinksContentRewriterTest.java
URL: http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/rewrite/StyleLinksContentRewriterTest.java?rev=710176&r1=710175&r2=710176&view=diff
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/rewrite/StyleLinksContentRewriterTest.java (original)
+++ incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/rewrite/StyleLinksContentRewriterTest.java Mon Nov  3 13:41:23 2008
@@ -18,6 +18,7 @@
  */
 package org.apache.shindig.gadgets.rewrite;
 
+import com.google.common.collect.Sets;
 import com.google.inject.Guice;
 import com.google.inject.Injector;
 import org.apache.shindig.gadgets.parse.GadgetHtmlParser;
@@ -64,7 +65,9 @@
       ".someid {background-image:url(\"" + LINK_PREFIX + "http://a.b.com/bigimg.png\");float:right;width:165px;height:23px;margin-top:4px;margin-left:5px}";
     // Rewrite, document is mutated in-place
     rewriteHelper(rewriter, s, document);
-    assertEquals(rewritten, document.getElementsByTagName("STYLE").item(0).getTextContent());
+    assertEquals(rewritten,
+        HtmlContentRewriter.getElementsByTagNameCaseInsensitive(document,
+            Sets.newHashSet("style")).get(0).getTextContent());
   }
   
   public void testStyleTagRewritesIgnoredOnBadParse() throws Exception {