You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@shindig.apache.org by jo...@apache.org on 2008/10/03 02:25:43 UTC

svn commit: r701267 - in /incubator/shindig/trunk/java/gadgets/src: main/java/org/apache/shindig/gadgets/parse/ test/java/org/apache/shindig/gadgets/parse/

Author: johnh
Date: Thu Oct  2 17:23:28 2008
New Revision: 701267

URL: http://svn.apache.org/viewvc?rev=701267&view=rev
Log:
Adding infrastructure for automatic caching by GadgetHtmlParser implementations. This does not yet affect any existing operation, as it's not included in CajaHtmlParser.

Two classes are provided:
1. AbstractCachingGadgetHtmlParser. A GadgetHtmlParser implementation simply subclasses this with proper configuration (a cache and the minimum parse time above which it's worth caching parsed content) to automatically support caching. Content is cached by MD5 of the String inputs.

2. ParseTreeSerializer, with Test. PTS handles conversion between byte[] <-> List<ParsedHtmlNode>. It implements custom serialization routines rather than using Java's built-in serialization for several reasons:
  A) Smaller storage requirements. Testing yielded between 10 and 30% space savings compared to Java serialization, with higher savings correlated with more-structured (fewer Strings) input.
  B) Faster execution time. Custom serialization measured between 30-40% faster than Java serialization, while custom deserialization measured 35-50% faster. For a "typical" sample gadget of ~30kB of content, serialization averaged 2.1ms and deserialization clocked in at an avg of 1.3ms. These compare quite favorably to the CajaHtmlParser's average parsing time of 25ms.
  C) Cross-platform, cross-parser, and easier cross-version (de)serialization. This rationale is weaker but noted for completeness.

Usage of these classes, ideally combined with additional rules restricting hangman variable substitution to Text content (Text nodes and Attributes), should allow Shindig to obtain and process a gadget as a parse tree relatively cheaply.


Added:
    incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/AbstractCachingGadgetHtmlParser.java
    incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/ParseTreeSerializer.java
    incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/parse/ParseTreeSerializerTest.java

Added: incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/AbstractCachingGadgetHtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/AbstractCachingGadgetHtmlParser.java?rev=701267&view=auto
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/AbstractCachingGadgetHtmlParser.java (added)
+++ incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/AbstractCachingGadgetHtmlParser.java Thu Oct  2 17:23:28 2008
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ */
+package org.apache.shindig.gadgets.parse;
+
+import org.apache.shindig.common.cache.Cache;
+import org.apache.shindig.common.cache.CacheProvider;
+import org.apache.shindig.common.util.HashUtil;
+import org.apache.shindig.gadgets.GadgetException;
+
+import java.util.List;
+
+/**
+ * Abstract base class for {@code GadgetHtmlParser} classes that provides a caching
+ * layer for parse trees. The cache is provided by a given {@code CacheProvider},
+ * with the configured size. The class is also configured with the minimum parse
+ * time, in milliseconds, for which it's worth caching contents at all. This
+ * is a rough heuristic, but useful to avoid caching contents that are parsed faster
+ * than a cache hit and deserialization would take. A value <= 0 turns this feature
+ * off, meaning every successfully parsed tree is cached.
+ *
+ * Cached entries are keyed by {@code HashUtil.checksum} of the UTF-8 bytes of the
+ * source and stored in the byte form produced by {@code ParseTreeSerializer}.
+ *
+ * Essentially any real {@code GadgetHtmlParser} should extend this base class, as
+ * its abstract method's signature is identical to the interface.
+ */
+public abstract class AbstractCachingGadgetHtmlParser implements GadgetHtmlParser {
+  // Fixed charset for cache keys, so the key never depends on the platform default.
+  private static final String CACHE_KEY_ENCODING = "UTF-8";
+
+  /**
+   * Parses the given source without consulting the cache.
+   *
+   * @param source Content to parse.
+   * @return The parse tree. A {@code null} result is passed through to the caller
+   *     of {@link #parse(String)} and is never cached.
+   * @throws GadgetException If the subclass fails to parse the content.
+   */
+  protected abstract List<ParsedHtmlNode> doParse(String source) throws GadgetException;
+
+  private final Cache<String, byte[]> parseTreeCache;
+  private final ParseTreeSerializer pts;
+  private final int cacheTimeMsCutoff;
+
+  /**
+   * @param cacheProvider Source of the backing cache; may be null to disable caching.
+   * @param capacity Capacity requested from the provider; values <= 0 disable caching.
+   * @param cacheTimeMsCutoff Parse time, in milliseconds, that must be exceeded for a
+   *     result to be cached. Values <= 0 cache every result.
+   */
+  protected AbstractCachingGadgetHtmlParser(CacheProvider cacheProvider,
+      int capacity, int cacheTimeMsCutoff) {
+    if (cacheProvider != null && capacity > 0) {
+      parseTreeCache = cacheProvider.createCache(capacity);
+    } else {
+      // Cache can be configured to do nothing for test instances, etc.
+      parseTreeCache = new DoNothingCache();
+    }
+    pts = new ParseTreeSerializer();
+    this.cacheTimeMsCutoff = cacheTimeMsCutoff;
+  }
+
+  /**
+   * Returns the parse tree for {@code source}, from the cache when a usable entry
+   * exists and from {@link #doParse(String)} otherwise.
+   *
+   * @param source Content to parse.
+   * @return The parse tree, or null if {@code doParse} returned null.
+   * @throws GadgetException If {@code doParse} fails.
+   */
+  public List<ParsedHtmlNode> parse(String source) throws GadgetException {
+    String cacheKey = HashUtil.checksum(cacheKeyBytes(source));
+    byte[] cached = parseTreeCache.getElement(cacheKey);
+    if (cached != null) {
+      List<ParsedHtmlNode> fromCache = pts.deserialize(cached);
+      if (fromCache != null) {
+        return fromCache;
+      }
+      // The serializer rejected the entry (e.g. written by a different format
+      // version). Drop it and fall through to a fresh parse rather than handing
+      // the caller a null tree.
+      parseTreeCache.removeElement(cacheKey);
+    }
+
+    long parseStart = System.currentTimeMillis();
+    List<ParsedHtmlNode> parsed = doParse(source);
+    if (parsed == null) {
+      return null;
+    }
+
+    long elapsedMs = System.currentTimeMillis() - parseStart;
+    // A cutoff <= 0 disables the heuristic; the explicit check matters for a cutoff
+    // of exactly 0, where a parse measured at 0ms would otherwise never be cached.
+    if (cacheTimeMsCutoff <= 0 || elapsedMs > cacheTimeMsCutoff) {
+      parseTreeCache.addElement(cacheKey, pts.serialize(parsed));
+    }
+
+    return parsed;
+  }
+
+  // Converts the source to the bytes that are checksummed to form the cache key.
+  private static byte[] cacheKeyBytes(String source) {
+    try {
+      return source.getBytes(CACHE_KEY_ENCODING);
+    } catch (java.io.UnsupportedEncodingException e) {
+      // UTF-8 support is mandatory for every Java platform implementation.
+      throw new IllegalStateException(CACHE_KEY_ENCODING + " is not supported", e);
+    }
+  }
+
+  // Cache implementation that stores nothing, used when caching is disabled.
+  private static class DoNothingCache implements Cache<String, byte[]> {
+    public void addElement(String key, byte[] value) { }
+    public byte[] getElement(String key) { return null; }
+    public byte[] removeElement(String key) { return null; }
+  }
+}
\ No newline at end of file

Added: incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/ParseTreeSerializer.java
URL: http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/ParseTreeSerializer.java?rev=701267&view=auto
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/ParseTreeSerializer.java (added)
+++ incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/parse/ParseTreeSerializer.java Thu Oct  2 17:23:28 2008
@@ -0,0 +1,327 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ */
+package org.apache.shindig.gadgets.parse;
+
+import java.io.IOException;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.InputStream;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Handles serializing and deserializing parse trees (Lists of {@code ParsedHtmlNode} objects)
+ * to and from byte[] form.
+ * 
+ * When deserializing, different underlying objects are actually
+ * provided, but provide identical data behind the ParsedHtmlNode and Attribute interfaces.
+ * This is a side effect of the fact that what's actually serialized are custom-serialized
+ * implementations of these interfaces. This custom implementation provides several benefits
+ * as compared to standard Java serialization, smaller data size, faster processing time,
+ * and cross-platform compatibility.
+ * 
+ * Each serialized object is annotated with a versionID, similar to serialVersionUID used
+ * by Java's serialization mechanism.
+ */
+public class ParseTreeSerializer {
+  private static final int SERIALIZATION_VERSION_ID = 1;
+
+  /**
+   * Convert the list of {@code ParsedHtmlNode} elements into a byte blob suitable
+   * for persisting in a cache.
+   * @param nodes List of parsed html contents.
+   * @return Blob of bytes representing contents.
+   */
+  public byte[] serialize(List<ParsedHtmlNode> nodes) {
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    ObjectOutputStream oos = null;
+    try {
+      SerializableParsedNode parent = SerializableParsedNode.makeParent(nodes);
+      parent.writeObject(baos);
+    } catch (IOException e) {
+      // Never happens.
+    }
+    return baos.toByteArray();
+  }
+  
+  /**
+   * Attempt to convert a blob of bytes, likely generated by {@code serialize()},
+   * into a Java representation of parsed content.
+   * @param buffer Byte blob of serialized data.
+   * @return List of nodes.
+   */
+  public List<ParsedHtmlNode> deserialize(byte[] buffer) {
+    ByteArrayInputStream bais = new ByteArrayInputStream(buffer);
+    ObjectInputStream ois = null;
+    List<ParsedHtmlNode> nodes = null;
+    try {
+      SerializableParsedNode spn = SerializableParsedNode.readObject(bais);
+      if (spn == null) {
+        return null;
+      }
+      nodes = spn.getChildren();
+    } catch (IOException e) {
+      // Strange, but no matter: invalidate by returning null.
+    } catch (ClassCastException e) {
+      // Invalidate entry by returning null.
+    }
+    return nodes;
+  }
+  
+  // Helper method: converts input list to a list of SerializableParsedNodes
+  private static List<SerializableParsedNode> getSerializableNodeList(List<ParsedHtmlNode> nodes) {
+    if (nodes == null) {
+      return null;
+    }
+    List<SerializableParsedNode> outNodes = new ArrayList<SerializableParsedNode>(nodes.size());
+    for (ParsedHtmlNode node : nodes) {
+      outNodes.add(new SerializableParsedNode(node));
+    }
+    return outNodes;
+  }
+  
+  // Helper method: converts input list to a list of SerializableParsedAttribs
+  private static List<SerializableParsedAttrib> getSerializableAttribList(
+      List<ParsedHtmlAttribute> attribs) {
+    if (attribs == null) {
+      return null;
+    }
+    List<SerializableParsedAttrib> outAttribs = new ArrayList<SerializableParsedAttrib>(attribs.size());
+    for (ParsedHtmlAttribute attrib : attribs) {
+      outAttribs.add(new SerializableParsedAttrib(attrib));
+    }
+    return outAttribs;
+  }
+  
+  // Serialization helper: writes an integer to an OutputStream
+  private static void writeInt(OutputStream out, int num) throws IOException {
+    out.write(num >> 0);
+    out.write(num >> 8);
+    out.write(num >> 16);
+    out.write(num >> 24);
+  }
+  
+  // Serialization helper: reads an integer written by writeInt from an InputStream
+  private static int readInt(InputStream in) throws IOException {
+    return in.read() << 0 |
+           in.read() << 8 |
+           in.read() << 16 |
+           in.read() << 24;
+  }
+
+  // Serialization helper: write a String to OutputStream. Uses UTF8 format for all Strings
+  private static void writeString(OutputStream out, String str) throws IOException {
+    if (str == null) {
+      writeInt(out, 0);
+      return;
+    }
+    byte[] utf8bytes = str.getBytes("UTF8");
+    writeInt(out, utf8bytes.length);
+    out.write(utf8bytes, 0, utf8bytes.length);
+  }
+  
+  // Serialization helper: reads a writeString()-written String from the given InputStream
+  private static String readString(InputStream in) throws IOException {
+    int len = readInt(in);
+    if (len == 0) {
+      return null;
+    }
+    byte[] stringBytes = new byte[len];
+    int read = 0;
+    while (read < len) {
+      int thisTime = in.read(stringBytes, read, (len - read));
+      if (thisTime == -1) {
+        throw new IOException("Insufficient data in buffer to read");
+      }
+      read += thisTime;
+    }
+    return new String(stringBytes, "UTF8");
+  }
+  
+  //
+  // ParsedHtmlNode implementation providing custom serialization routines.
+  // This class is used for both serializing and deserializing ParsedHtmlNode lists.
+  //
+  private static class SerializableParsedNode implements ParsedHtmlNode {
+    private String tag;
+    private List<SerializableParsedAttrib> sattribs;
+    private List<ParsedHtmlAttribute> attribs;
+    private List<SerializableParsedNode> schildren;
+    private List<ParsedHtmlNode> children;
+    private String text;
+
+    /** {@inheritDoc} */
+    public List<ParsedHtmlAttribute> getAttributes() {
+      if (attribs == null) {
+        attribs = new ArrayList<ParsedHtmlAttribute>(this.sattribs.size());
+        attribs.addAll(this.sattribs);
+      }
+      return attribs;
+    }
+
+    /** {@inheritDoc} */
+    public List<ParsedHtmlNode> getChildren() {
+      if (children == null) {
+        children = new ArrayList<ParsedHtmlNode>(this.schildren.size());
+        children.addAll(this.schildren);
+      }
+      return children;
+    }
+
+    /** {@inheritDoc} */
+    public String getTagName() {
+      return tag;
+    }
+
+    /** {@inheritDoc} */
+    public String getText() {
+      return text;
+    }
+    
+    // Key helper method, responsible for writing this object to OutputStream as bytes.
+    private void writeObject(OutputStream out) throws IOException {
+      // Format: version, then isTag, followed by fields.
+      writeInt(out, SERIALIZATION_VERSION_ID);
+      
+      if (tag != null) {
+        // isTag
+        out.write(1);
+        
+        // Tag: length and bytes
+        writeString(out, tag);
+        
+        // Attribs: length (as short) and each attrib
+        writeInt(out, sattribs.size()); 
+        for (SerializableParsedAttrib attrib : sattribs) {
+          attrib.writeObject(out);
+        }
+        
+        // Children: length and each child
+        writeInt(out, schildren.size());
+        for (SerializableParsedNode node : schildren) {
+          node.writeObject(out);
+        }
+      } else {
+        // isTag
+        out.write(0);
+        
+        // Text: length and bytes
+        writeString(out, text);
+      }
+    }
+
+    // Read the object from the InputStream, ensuring its byte representation
+    // is compatible with the current deserialization routine.
+    private static SerializableParsedNode readObject(InputStream in) throws IOException {
+      int writtenVersion = readInt(in);
+      if (writtenVersion != SERIALIZATION_VERSION_ID) {
+        // Now-invalid entry. Serialization format has changed.
+        return null;
+      }
+      
+      boolean isTag = in.read() != 0;
+      
+      if (isTag) {
+        // Read tag and iteratively read each attrib and child.
+        String tag = readString(in);
+        
+        try {
+          int attribLen = readInt(in);
+          List<SerializableParsedAttrib> sattribs = new ArrayList<SerializableParsedAttrib>(attribLen);
+          for (int i = 0; i < attribLen; ++i) {
+            sattribs.add(SerializableParsedAttrib.readObject(in));
+          }
+        
+          int childLen = readInt(in);
+          List<SerializableParsedNode> schildren = new ArrayList<SerializableParsedNode>(childLen);
+          for (int i = 0; i < childLen; ++i) {
+            schildren.add(SerializableParsedNode.readObject(in));
+          }
+          
+          return new SerializableParsedNode(tag, sattribs, schildren); 
+        } catch (ClassCastException e) {
+          throw new IOException("Class cast exception reading object: " + e);
+        }
+      } else {
+        // Just read the text field.
+        return new SerializableParsedNode(readString(in));
+      }
+    }
+    
+    // Create a wrapper node as a convenience for serializing a list of nodes.
+    private static SerializableParsedNode makeParent(List<ParsedHtmlNode> kids) {
+      return new SerializableParsedNode("x",
+          new ArrayList<SerializableParsedAttrib>(), getSerializableNodeList(kids));
+    }
+    
+    // Constructors: from generic ParsedHtmlNode, and for a new Tag or Text type node.
+    private SerializableParsedNode(ParsedHtmlNode source) {
+      this.tag = source.getTagName();
+      this.sattribs = getSerializableAttribList(source.getAttributes());
+      this.schildren = getSerializableNodeList(source.getChildren());
+      this.text = source.getText();
+    }
+    
+    private SerializableParsedNode(String tag,
+        List<SerializableParsedAttrib> sattribs, List<SerializableParsedNode> schildren) {
+      this.tag = tag;
+      this.sattribs = sattribs;
+      this.schildren = schildren;
+    }
+    
+    private SerializableParsedNode(String text) {
+      this.text = text;
+    }
+  }
+  
+  // Attribute equivalent of SerializableParsedNode
+  private static class SerializableParsedAttrib implements ParsedHtmlAttribute {
+    private String name;
+    private String value;
+    
+    private SerializableParsedAttrib(ParsedHtmlAttribute source) {
+      this.name = source.getName();
+      this.value = source.getValue();
+    }
+    
+    private SerializableParsedAttrib(String name, String value) {
+      this.name = name;
+      this.value = value;
+    }
+
+    public String getName() {
+      return name;
+    }
+
+    public String getValue() {
+      return value;
+    }
+    
+    public void writeObject(OutputStream out) throws IOException {
+      writeString(out, name);
+      writeString(out, value);
+    }
+    
+    private static SerializableParsedAttrib readObject(InputStream in) throws IOException {
+      return new SerializableParsedAttrib(readString(in), readString(in));
+    }
+  }
+}

Added: incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/parse/ParseTreeSerializerTest.java
URL: http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/parse/ParseTreeSerializerTest.java?rev=701267&view=auto
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/parse/ParseTreeSerializerTest.java (added)
+++ incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/parse/ParseTreeSerializerTest.java Thu Oct  2 17:23:28 2008
@@ -0,0 +1,268 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ */
+package org.apache.shindig.gadgets.parse;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+import org.apache.shindig.gadgets.parse.caja.CajaHtmlParser;
+import org.junit.Test;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.util.Arrays;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * Tests serialization and deserialization of parse trees.
+ */
+public class ParseTreeSerializerTest {
+  private static ParseTreeSerializer pts = new ParseTreeSerializer();
+  
+  public static void main(String[] args) throws Exception {
+    // Test can be run as standalone program to test out serialization and parsing
+    // performance numbers, using Caja as a parser.
+    if (args.length != 2) {
+      System.err.println("Args: <input-file> <num-runs>");
+      System.exit(1);
+    }
+    
+    String fileArg = args[0];
+    File inputFile = new File(fileArg);
+    if (!inputFile.exists() || !inputFile.canRead()) {
+      System.err.println("Input file: " + fileArg + " not found or can't be read.");
+      System.exit(1);
+    }
+    
+    String runsArg = args[1];
+    int numRuns = -1;
+    try {
+      numRuns = Integer.parseInt(runsArg);
+    } catch (Exception e) {
+      System.err.println("Invalid num-runs argument: " + runsArg + ", reason: " + e);
+    }
+    
+    FileInputStream fis = new FileInputStream(inputFile);
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    byte[] buf = new byte[65535];
+    int read = -1;
+    while ((read = fis.read(buf)) > 0) {
+      baos.write(buf, 0, read);
+    }
+    String inputData = new String(baos.toByteArray());
+    
+    // Caja parser.
+    System.out.println("Parsing contents of '" + fileArg + "' " + numRuns + " times...");
+    CajaHtmlParser parser = new CajaHtmlParser();
+    long parseStart = System.currentTimeMillis();
+    List<ParsedHtmlNode> nodes = null;
+    for (int i = 0; i < numRuns; ++i) {
+      nodes = parser.parse(inputData);
+    }
+    long parseMillis = System.currentTimeMillis() - parseStart;
+    
+    // Serializer/deserializer
+    System.out.println("Serializing and deserializing results of Caja run (" +
+        nodes.size() + " top-level nodes, " + numRuns + " runs)\n");
+    long serTime = 0, deserTime = 0;
+    for (int i = 0; i < numRuns; ++i) {
+      long serStart = System.currentTimeMillis();
+      byte[] ser = pts.serialize(nodes);
+      serTime += (System.currentTimeMillis() - serStart);
+      long deserStart = System.currentTimeMillis();
+      List<ParsedHtmlNode> outs = pts.deserialize(ser);
+      deserTime += (System.currentTimeMillis() - deserStart);
+      checkListEquality(nodes, outs);
+    }
+    
+    System.out.println("Parsing [" + parseMillis + " ms total: " + 
+        ((double)parseMillis)/numRuns + "ms/run]");
+    System.out.println("Serialization [" + serTime + " ms total: "
+        + ((double)serTime)/numRuns + "ms/run]");
+    System.out.println("Deserialization [" + deserTime + " ms total: "
+        + ((double)deserTime)/numRuns + "ms/run]");
+  }
+  
+  @Test
+  public void fromTestTreeToBytesAndBack() throws Exception {
+    List<ParsedHtmlNode> nodes = new LinkedList<ParsedHtmlNode>();
+    nodes.add(getEverythingNode());
+    nodes.add(getEverythingNode());
+    checkSerializationPasses(nodes);
+  }
+  
+  @Test
+  public void cantDeserializeDifferentVersion() throws Exception {
+    List<ParsedHtmlNode> nodes = new LinkedList<ParsedHtmlNode>();
+    nodes.add(getEverythingNode());
+    byte[] serialized = pts.serialize(nodes);
+    List<ParsedHtmlNode> back = pts.deserialize(serialized);
+    checkListEquality(nodes, back);
+    
+    // This never happens in a given run of code, but is used to simulate
+    // the version number of cached data getting out of sync with processing code.
+    serialized[0]++;
+    assertNull(pts.deserialize(serialized));
+  }
+  
+  @Test
+  public void fromCajaTreeToBytesAndBack() throws Exception {
+    String bigHTML = "";
+    for (int i = 0; i < 100; ++i) {
+      bigHTML += "<parent pkey=\"pval\">parentText<child ckey=\"cval\">childText</child></parent>";
+    }
+    checkSerializationPasses(new CajaHtmlParser().parse(bigHTML));
+  }
+
+  private ParsedHtmlNode getEverythingNode() {
+    // Return node containing a text node and a child node with attributes.
+    ParsedHtmlNode childText = TestParsedHtmlNode.getText("childText");
+    String[] childNVs = { "child", "cval" };
+    ParsedHtmlNode[] childChildren = { childText };
+    ParsedHtmlNode child = TestParsedHtmlNode.getTag("childNode", childNVs, childChildren);
+    
+    ParsedHtmlNode parentText = TestParsedHtmlNode.getText("parentText");
+    String[] parentNVs = { "parent", "pval" };
+    ParsedHtmlNode[] children = { child };
+    return TestParsedHtmlNode.getTag("parentNode", parentNVs, children);
+  }
+  
+  private static void checkSerializationPasses(List<ParsedHtmlNode> raw) throws Exception {
+    byte[] serialized = pts.serialize(raw);
+    List<ParsedHtmlNode> fromTheDead = pts.deserialize(serialized);
+    checkListEquality(raw, fromTheDead);
+  }
+  
+  private static void checkListEquality(List<ParsedHtmlNode> raw, List<ParsedHtmlNode> outs) {
+    List<ParsedHtmlNode> rawTestable = new LinkedList<ParsedHtmlNode>();
+    for (ParsedHtmlNode rawNode : raw) {
+      rawTestable.add(TestParsedHtmlNode.get(rawNode));
+    }
+    List<ParsedHtmlNode> outTestable = new LinkedList<ParsedHtmlNode>();
+    for (ParsedHtmlNode inNode : outs) {
+      outTestable.add(TestParsedHtmlNode.get(inNode));
+    }
+    assertEquals(rawTestable, outTestable);
+  }
+  
+  // Test class providing both a fake ParsedHtmlNode class as well as
+  // one that provides equality testing for ParsedHtmlNodes of any provenance
+  private static class TestParsedHtmlNode implements ParsedHtmlNode {
+    private String tag;
+    private String text;
+    private List<ParsedHtmlAttribute> attribs;
+    private List<ParsedHtmlNode> children;
+    
+    public static ParsedHtmlNode get(ParsedHtmlNode in) {
+      TestParsedHtmlNode node = new TestParsedHtmlNode();
+      node.text = in.getText();
+      if (node.text == null) {
+        node.tag = in.getTagName();
+        node.attribs = new LinkedList<ParsedHtmlAttribute>();
+        for (ParsedHtmlAttribute pha : in.getAttributes()) {
+          node.attribs.add(new TestParsedHtmlAttribute(pha.getName(), pha.getValue()));
+        }
+        node.children = new LinkedList<ParsedHtmlNode>();
+        for (ParsedHtmlNode child : in.getChildren()) {
+          node.children.add(TestParsedHtmlNode.get(child));
+        }
+      }
+      return node;
+    }
+    
+    public static ParsedHtmlNode getTag(String tag, String[] nvpairs, ParsedHtmlNode[] children) {
+      TestParsedHtmlNode node = new TestParsedHtmlNode();
+      node.tag = tag;
+      node.attribs = new LinkedList<ParsedHtmlAttribute>();
+      for (int i = 0; i < nvpairs.length; i += 2) {
+        node.attribs.add(new TestParsedHtmlAttribute(nvpairs[i], nvpairs[i+1]));
+      }
+      // Just in case somehow Arrays.asList() doesn't return a List subclassing
+      // AbstractList (whose .equals() method doesn't check list type)
+      node.children = new LinkedList<ParsedHtmlNode>();
+      node.children.addAll(Arrays.asList(children));
+      return node;
+    }
+    
+    public static ParsedHtmlNode getText(String text) {
+      TestParsedHtmlNode node = new TestParsedHtmlNode();
+      node.text = text;
+      return node;
+    }
+
+    public List<ParsedHtmlAttribute> getAttributes() {
+      return attribs;
+    }
+
+    public List<ParsedHtmlNode> getChildren() {
+      return children;
+    }
+
+    public String getTagName() {
+      return tag;
+    }
+
+    public String getText() {
+      return text;
+    }
+    
+    @Override
+    public boolean equals(Object other) {
+      if (!(other instanceof TestParsedHtmlNode)) {
+        return false;
+      }
+      TestParsedHtmlNode onode = (TestParsedHtmlNode)other;
+      if (this.text != null) {
+        return this.text.equals(onode.text);
+      }
+      return (this.tag.equals(onode.tag) &&
+              this.attribs.equals(onode.attribs) &&
+              this.children.equals(onode.children));
+    }
+  }
+  
+  private static class TestParsedHtmlAttribute implements ParsedHtmlAttribute {
+    private String name;
+    private String value;
+    
+    private TestParsedHtmlAttribute(String name, String value) {
+      this.name = name;
+      this.value = value;
+    }
+
+    public String getName() {
+      return name;
+    }
+
+    public String getValue() {
+      return value;
+    }
+    
+    @Override
+    public boolean equals(Object other) {
+      if (!(other instanceof TestParsedHtmlAttribute)) {
+        return false;
+      }
+      TestParsedHtmlAttribute oattr = (TestParsedHtmlAttribute)other;
+      return (this.name.equals(oattr.name) &&
+              this.value.equals(oattr.value));
+    }
+  }
+}