You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by no...@apache.org on 2009/09/18 12:32:15 UTC
svn commit: r816577 -
/lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathRecordReader.java
Author: noble
Date: Fri Sep 18 10:32:15 2009
New Revision: 816577
URL: http://svn.apache.org/viewvc?rev=816577&view=rev
Log:
SOLR-1437 . javadocs added. skipTag inlined and use loop instead of recursion
Modified:
lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathRecordReader.java
Modified: lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathRecordReader.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathRecordReader.java?rev=816577&r1=816576&r2=816577&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathRecordReader.java (original)
+++ lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathRecordReader.java Fri Sep 18 10:32:15 2009
@@ -28,35 +28,51 @@
/**
* <p>
- * A streaming xpath parser which uses StAX for XML parsing. It supports only a
- * subset of xpath syntax.
- * </p>
+ * A streaming xpath parser which uses StAX for XML parsing. It supports only
+ * a subset of xpath syntax.
+ * </p><pre>
* /a/b/subject[@qualifier='fullTitle']
+ * /a/b/subject[@qualifier=]/subtag
* /a/b/subject/@qualifier
* /a/b/c
- *
+ * </pre>
* Keep in mind that the wild-card syntax '//' is not supported
+ * A record is a Map<String,Object> . The key is the provided name
+ * and the value is a String or a List<String>
*
+ * This class is thread-safe for parsing xml. But adding fields is not
+ * thread-safe. The recommended usage is to addField() in one thread and
+ * then share the instance across threads.
+ * </p>
* <p/>
* <b>This API is experimental and may change in the future.</b>
- * This class is thread-safe for parsing xml . But adding fields is not thread-safe. The recommended usage is
- * to addField() in one thread and then share the instance across threads.
- *
+ * <p>
* @version $Id$
* @since solr 1.3
*/
public class XPathRecordReader {
private Node rootNode = new Node("/", null);
- /**Use this flag in the addField() method to fetch all the cdata under a specific tag
- *
+ /**
+ * The FLATTEN flag indicates that all text and cdata under a specific
+ * tag should be recursively fetched and appended to the current Node's
+ * value.
*/
public static final int FLATTEN = 1;
/**
- * @param forEachXpath The XPATH for which a record is emitted. At the start of this xpath tag, it starts collecting the fields and at the close
- * of the tag ,a record is emitted and the fields collected since the tag start is included in the record. If there
- * are fields collected in the parent tag(s) they also will be included in the record but not cleared after emitting the record.
- * It can use the ' | ' syntax of XPATH to pass in multiple xpaths.
+ * A constructor called with a '|' separated list of Xpath expressions
+ * which define sub sections of the XML stream that are to be emitted as
+ * separate records.
+ *
+ * @param forEachXpath The XPATH for which a record is emitted. Once the
+ * xpath tag is encountered, the Node.parse method starts collecting wanted
+ * fields and at the close of the tag, a record is emitted containing all
+ * fields collected since the tag start. Once
+ * emitted the collected fields are cleared. Any fields collected in the parent tag or above
+ * will also be included in the record, but these are not
+ * cleared after emitting the record.
+
+ * It uses the ' | ' syntax of XPATH to pass in multiple xpaths.
*/
public XPathRecordReader(String forEachXpath) {
String[] splits = forEachXpath.split("\\|");
@@ -64,10 +80,21 @@
split = split.trim();
if (split.length() == 0)
continue;
+ // The created Node has a name set to the full forEach attribute xpath
addField0(split, split, false, true, 0);
}
}
+ /**
+ * A wrapper around {@link #addField0 addField0()} to create a series of Nodes
+ * based on the supplied Xpath for the given fieldName. The created nodes
+ * are inserted into a Node tree.
+ *
+ * @param name The name for this field in the emitted record
+ * @param xpath The xpath expression for this field
+ * @param multiValued If 'true' then the emitted record will have values in
+ * a List<String>
+ */
public synchronized XPathRecordReader addField(String name, String xpath, boolean multiValued) {
if (!xpath.startsWith("/"))
throw new RuntimeException("xpath must start with '/' : " + xpath);
@@ -75,11 +102,16 @@
return this;
}
- /**Add a field's XPATH and its name.
- * @param name . The name by which this field is referred in the emitted record
- * @param xpath . The xpath to this field
- * @param multiValued . If this is 'true' , then the emitted record will have a List<String> as value
- * @param flags . The only supported flag is 'FLATTEN'
+ /**
+ * A wrapper around {@link #addField0 addField0()} to create a series of Nodes
+ * based on the supplied Xpath for the given fieldName. The created nodes
+ * are inserted into a Node tree.
+ *
+ * @param name The name for this field in the emitted record
+ * @param xpath The xpath expression for this field
+ * @param multiValued If 'true' then the emitted record will have values in
+ * a List<String>
+ * @param flags FLATTEN: Recursively combine text from all child XML elements
*/
public synchronized XPathRecordReader addField(String name, String xpath, boolean multiValued, int flags) {
if (!xpath.startsWith("/"))
@@ -88,6 +120,18 @@
return this;
}
+ /**
+ * Splits the XPATH into a List of xpath segments and calls build() to
+ * construct a tree of Nodes representing xpath segments. The resulting
+ * tree structure ends up describing all the Xpaths we are interested in.
+ *
+ * @param xpath The xpath expression for this field
+ * @param name The name for this field in the emitted record
+ * @param multiValued If 'true' then the emitted record will have values in
+ * a List<String>
+ * @param isRecord When 'true' flags that this XPATH is from a forEach statement
+ * @param flags The only supported flag is 'FLATTEN'
+ */
private void addField0(String xpath, String name, boolean multiValued,
boolean isRecord, int flags) {
List<String> paths = splitEscapeQuote(xpath);
@@ -96,6 +140,14 @@
rootNode.build(paths, name, multiValued, isRecord, flags);
}
+ /**
+ * Uses {@link #streamRecords streamRecords} to parse the XML source but
+ * collects the emitted records into a List which is returned upon completion.
+ *
+ * @param r the stream reader
+ * @return results a List of emitted records
+ *
+ */
public List<Map<String, Object>> getAllRecords(Reader r) {
final List<Map<String, Object>> results = new ArrayList<Map<String, Object>>();
streamRecords(r, new Handler() {
@@ -106,8 +158,12 @@
return results;
}
- /** Stream records as and when they are colected
- * @param r The reader
+ /**
+ * Creates an XML stream reader on top of whatever reader has been
+ * configured. Then calls parse() with a handler which is
+ * invoked forEach record emitted.
+ *
+ * @param r the stream reader
* @param handler The callback instance
*/
public void streamRecords(Reader r, Handler handler) {
@@ -120,60 +176,73 @@
}
}
- /**For each node/leaf in the tree there is one object of this class
+
+ /**
+ * For each node/leaf in the Node tree there is one object of this class.
+ * This tree of objects represents all the XPaths we are interested in.
+ * For each Xpath segment of interest we create a node. In most cases the
+ * node (branch) is rather basic , but for the final portion (leaf) of any Xpath we add
+ * more information to the Node. When parsing the XML document we
+ * step through this tree as we stream records from the reader. If the XML
+ * document departs from this tree we skip start tags till we are back on
+ * the tree.
+ *
*/
private class Node {
- /**name of the tag/attribute*/
- String name;
-
- /**The field name as passed in the addField() . This will be used in the record*/
- String fieldName;
- /**stores the xpath name such as '@attr='xyz'*/
- String xpathName;
- /**The xpath of the record. if this is a record node */
- String forEachPath;
- /**child attribute nodes */
- List<Node> attributes;
- /**child nodes*/
- List<Node> childNodes;
- /**if attribs are used in the xpath their names and values*/
+ String name; // generally: segment of the Xpath represented by this Node
+ String fieldName; // the fieldname in the emitted record (key of the map)
+ String xpathName; // the segment of the Xpath represented by this Node
+ String forEachPath; // the full Xpath from the forEach entity attribute
+ List<Node> attributes; // a List of attribute Nodes associated with this Node
+ List<Node> childNodes; // a List of child Nodes of this node
List<Map.Entry<String, String>> attribAndValues;
+ Node parent; // parent Node in the tree
+ boolean hasText=false; // flag: store/emit streamed text for this node
+ boolean multiValued=false; //flag: this fields values are returned as a List
+ boolean isRecord=false; //flag: this Node starts a new record
+ private boolean flatten; //flag: child text is also to be emitted
- /**Parent node of this node */
- Node parent;
-
- boolean hasText = false, multiValued = false, isRecord = false;
-
- private boolean flatten;
public Node(String name, Node p) {
+ // Create a basic Node, suitable for the mid portions of any Xpath.
+ // Node.xpathName and Node.name are set to same value
xpathName = this.name = name;
parent = p;
}
public Node(String name, String fieldName, boolean multiValued) {
- this.name = name;
- this.fieldName = fieldName;
- this.multiValued = multiValued;
+ // This is only called from build() when describing an attribute.
+ this.name = name; // a segment from the Xpath
+ this.fieldName = fieldName; // name to store collected values against
+ this.multiValued = multiValued; // return collected values in a List
}
- /**This is the method where all the parsing happens. For each tag/subtag this gets called recursively.
+ /**
+ * This is the method where all the XML parsing happens. For each
+ * tag/subtag read from the source, this method is called recursively.
+ *
*/
private void parse(XMLStreamReader parser, Handler handler,
Map<String, Object> values, Stack<Set<String>> stack,
boolean recordStarted) throws IOException, XMLStreamException {
Set<String> valuesAddedinThisFrame = null;
if (isRecord) {
+ // This Node is a match for an XPATH from a forEach attribute,
+ // prepare to emit a new record when its END_ELEMENT is matched
recordStarted = true;
valuesAddedinThisFrame = new HashSet<String>();
stack.push(valuesAddedinThisFrame);
} else if (recordStarted) {
+ // This node is a child of some parent which matched against forEach
+ // attribute. Continue to add values to an existing record.
valuesAddedinThisFrame = stack.peek();
} else {
+ //if this tag has an attribute or text which is a branch/leaf just push an item up the stack
if (attributes != null || hasText)
valuesAddedinThisFrame = new HashSet<String>();
stack.push(valuesAddedinThisFrame);
}
+
try {
if (attributes != null) {
for (Node node : attributes) {
@@ -184,13 +253,15 @@
}
}
}
+
Set<Node> childrenFound = new HashSet<Node>();
- // for any normal event , parser.next() should be called in each iteration.
- // But for CDATA | CHARACTERS | SPACE it should not do so because handling of
- // CDATA itself would have consumed the next event. CDATA may throw multiple events
- // so all the events are slurped till a START_ELEMENT is encountered.
+ // Internally we have to gobble CDATA | CHARACTERS | SPACE events as we
+ // store text, the gobbling continues till we have fetched some other
+ // event. We use "isNextEventFetched" to indicate that the gobbling has
+ // already fetched the next event.
boolean isNextEventFetched = false;
int event = -1;
+
while (true) {
if (!isNextEventFetched) {
event = parser.next();
@@ -214,6 +285,8 @@
if ((event == CDATA || event == CHARACTERS || event == SPACE)
&& hasText) {
valuesAddedinThisFrame.add(fieldName);
+ // because we are fetching events here we need to ensure the outer
+ // loop does not end up doing an extra parser.next()
isNextEventFetched = true;
String text = parser.getText();
event = parser.next();
@@ -236,6 +309,8 @@
}
}
} else {
+ // We are not flatten-ing, so look to see if any of the child
+ // elements are wanted, and recurse if any are found.
handleStartElement(parser, childrenFound, handler, values, stack, recordStarted);
}
} else {
@@ -243,6 +318,7 @@
}
event = parser.next();
}
+ // save the text we have read against the fieldName in the Map values
putText(values, text, fieldName, multiValued);
} else if (event == START_ELEMENT) {
handleStartElement(parser, childrenFound, handler, values, stack, recordStarted);
@@ -276,8 +352,15 @@
if (n != null) {
childrenFound.add(n);
n.parse(parser, handler, values, stack, recordStarted);
- } else {
- skipTag(parser);
+ }
+ else {
+ // skip ELEMENTS till source document is back within the tree
+ int count=1; // we have had our first START_ELEMENT
+ while ( count != 0 ) {
+ int token = parser.next();
+ if (token == START_ELEMENT) count++;
+ else if (token == END_ELEMENT) count--;
+ }
}
}
@@ -311,8 +394,9 @@
return true;
}
- /**If there is no value available for a field in a subtag then add a null
- * TODO : needs better explanation
+ /**
+ * A recursive routine that walks the Node tree from a supplied start
+ * pushing a null string onto every multiValued fieldName's List of values.
*/
private void putNulls(Map<String, Object> values) {
if (attributes != null) {
@@ -329,7 +413,11 @@
}
}
- /**Handle multivalued fields by adding List<String>
+ /**
+ * Add the field name and text into the values Map. If it is a non multivalued field, then the text
+ * is simply placed in the object portion of the Map. If it is a
+ * multivalued field then the text is pushed onto a List which is
+ * the object portion of the Map.
*/
@SuppressWarnings("unchecked")
private void putText(Map<String, Object> values, String value,
@@ -346,49 +434,58 @@
}
}
- /**Skip a tag w/o processing the tag or its subtags
- */
- private void skipTag(XMLStreamReader parser) throws IOException,
- XMLStreamException {
- int type;
- while ((type = parser.next()) != END_ELEMENT) {
- if (type == START_ELEMENT)
- skipTag(parser);
- }
- }
- /**Build the node structure from the xpath
- * @param paths the xpaths split by '/'
- * @param fieldName name of the field
- * @param multiValued . is multiValued or not
- * @param record is this xpath a record or a field
- * @param flags extra flags
+ /**
+ * Build a Node tree structure representing all Xpaths of interest to us.
+ * This must be done before parsing of the XML stream starts. Each node
+ * holds one portion of an Xpath. Taking each Xpath segment in turn this
+ * method walks the Node tree and finds where the new segment should be
+ * inserted. It creates a Node representing a field's name, XPATH and
+ * some flags and inserts the Node into the Node tree.
+ *
*/
- private void build(List<String> paths, String fieldName,
- boolean multiValued, boolean record, int flags) {
- String name = paths.remove(0);
+ private void build(
+ List<String> paths, // a List of segments from the split xpaths
+ String fieldName, // the fieldName assoc with this Xpath
+ boolean multiValued, // flag if this fieldName is multiValued or not
+ boolean record, // is this xpath a record or a field
+ int flags // are we to flatten matching xpaths
+ ) {
+ // recursively walk the paths List adding new Nodes as required
+ String name = paths.remove(0); // shift out next Xpath segment
if (paths.isEmpty() && name.startsWith("@")) {
+ // we have reached end of element portion of Xpath and can now only
+ // have an element attribute. Add it to this nodes list of attributes
if (attributes == null) {
attributes = new ArrayList<Node>();
}
- name = name.substring(1);
+ name = name.substring(1); // strip the '@'
attributes.add(new Node(name, fieldName, multiValued));
} else {
if (childNodes == null)
childNodes = new ArrayList<Node>();
+ // does this "name" already exist as a child node.
Node n = getOrAddChildNode(name);
if (paths.isEmpty()) {
+ // We have reached the end of paths. When parsing the actual
+ // input we have traversed to a position where we actually have to
+ // do something. getOrAddChildNode() will have created and returned
+ // a new minimal Node with name and xpathName already populated. We
+ // need to add more information
if (record) {
- n.isRecord = true;
- n.forEachPath = fieldName;
+ // forEach attribute
+ n.isRecord = true; // flag: forEach attribute, prepare to emit rec
+ n.forEachPath = fieldName; // the full forEach attribute xpath
} else {
- n.hasText = true;
- n.fieldName = fieldName;
- n.multiValued = multiValued;
- n.flatten = flags == FLATTEN;
+ // xpath with content we want to store and return
+ n.hasText = true; // we have to store text found here
+ n.fieldName = fieldName; // name to store collected text against
+ n.multiValued = multiValued; // true: text be stored in a List
+ n.flatten = flags == FLATTEN; // true: store text from child tags
}
} else {
+ // recurse to handle next paths segment
n.build(paths, fieldName, multiValued, record, flags);
}
}
@@ -398,8 +495,8 @@
for (Node n : childNodes)
if (n.xpathName.equals(xpathName))
return n;
-
- Node n = new Node(xpathName, this);
+ // new territory! add a new node for this Xpath bitty
+ Node n = new Node(xpathName, this); // a minimal Node initialization
Matcher m = ATTRIB_PRESENT_WITHVAL.matcher(xpathName);
if (m.find()) {
n.name = m.group(1);
@@ -419,16 +516,20 @@
childNodes.add(n);
return n;
}
- }
+ } // end of class Node
+
- /**If a field has List then they have to be deep-copied for thread safety
+ /**
+ * Copies a supplied Map to a new Map which is returned. Used to copy a
+ * record's values. If a field's value is a List then it has to be
+ * deep-copied for thread safety
*/
private Map<String, Object> getDeepCopy(Map<String, Object> values) {
Map<String, Object> result = new HashMap<String, Object>();
for (Map.Entry<String, Object> entry : values.entrySet()) {
if (entry.getValue() instanceof List) {
result.put(entry.getKey(),new ArrayList((List) entry.getValue()));
- } else{
+ } else {
result.put(entry.getKey(),entry.getValue());
}
}
@@ -436,8 +537,9 @@
}
/**
- * Used for handling cases where there is a slash '/' character
- * inside the attribute value e.g. x@html='text/html'. We need to split
+ * The Xpath is split into segments using the '/' as a separator. However
+ * this method deals with special cases where there is a slash '/' character
+ * inside the attribute value e.g. x/@html='text/html'. We need to split
* by '/' excluding the '/' which is a part of the attribute's value.
*/
private static List<String> splitEscapeQuote(String str) {
@@ -465,15 +567,16 @@
factory.setProperty(XMLInputFactory.SUPPORT_DTD , Boolean.FALSE);
}
- /**Implement this interface to stream records as and when it is found.
+ /**Implement this interface to stream records as and when one is found.
*
*/
public static interface Handler {
/**
- * @param record The record map . The key is the field name as provided in the addField() methods. The value
- * can be a single String (for single valued) or a List<String> (for multiValued)
- * if an Exception is thrown from this method the parsing will be aborted
- * @param xpath . The forEach XPATH for which this record is being emitted
+ * @param record The record map. The key is the field name as provided in
+ * the addField() methods. The value can be a single String (for single
+ * valued fields) or a List<String> (for multiValued).
+ * @param xpath The forEach XPATH for which this record is being emitted
+ * If an Exception is thrown from this method all parsing will be aborted and the Exception is propagated up
*/
public void handle(Map<String, Object> record, String xpath);
}