You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by sh...@apache.org on 2009/02/02 12:30:23 UTC

svn commit: r739962 - in /lucene/solr/trunk/contrib/dataimporthandler: CHANGES.txt src/main/java/org/apache/solr/handler/dataimport/XPathRecordReader.java src/test/java/org/apache/solr/handler/dataimport/TestXPathRecordReader.java

Author: shalin
Date: Mon Feb  2 11:30:18 2009
New Revision: 739962

URL: http://svn.apache.org/viewvc?rev=739962&view=rev
Log:
SOLR-999 -- XPathRecordReader fails on XMLs with nodes mixed with CDATA content

Modified:
    lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt
    lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathRecordReader.java
    lucene/solr/trunk/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestXPathRecordReader.java

Modified: lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt?rev=739962&r1=739961&r2=739962&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt (original)
+++ lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt Mon Feb  2 11:30:18 2009
@@ -106,6 +106,9 @@
 13. SOLR-985: Fix thread-safety issue with TemplateString for concurrent imports with multiple cores.
               (Ryuuichi Kumai via shalin)
 
+14. SOLR-999: XPathRecordReader fails on XMLs with nodes mixed with CDATA content.
+              (Fergus McMenemie, Noble Paul via shalin)
+
 Documentation
 ----------------------
 

Modified: lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathRecordReader.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathRecordReader.java?rev=739962&r1=739961&r2=739962&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathRecordReader.java (original)
+++ lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathRecordReader.java Mon Feb  2 11:30:18 2009
@@ -162,19 +162,20 @@
             skipNextEvent = true;
             String text = parser.getText();
             event = parser.next();
-            while (event == CDATA || event == CHARACTERS || event == SPACE) {
-              text = text + parser.getText();
+
+            while (true) {
+              if(event == CDATA || event == CHARACTERS || event == SPACE) {
+                text = text + parser.getText();
+              } else if(event == START_ELEMENT) {
+                handleStartElement(parser, childrenFound, handler, values, stack, recordStarted);
+              } else {
+                break;
+              }
               event = parser.next();
             }
             putText(values, text, fieldName, multiValued);
           } else if (event == START_ELEMENT) {
-            Node n = getMatchingChild(parser);
-            if (n != null) {
-              childrenFound.add(n);
-              n.parse(parser, handler, values, stack, recordStarted);
-            } else {
-              skipTag(parser);
-            }
+            handleStartElement(parser, childrenFound, handler, values, stack, recordStarted);
           }
         }
       } finally {
@@ -193,6 +194,19 @@
       }
     }
 
+    private void handleStartElement(XMLStreamReader parser, Set<Node> childrenFound,
+                                    Handler handler, Map<String, Object> values,
+                                    Stack<Set<String>> stack, boolean recordStarted)
+            throws IOException, XMLStreamException {
+      Node n = getMatchingChild(parser);
+      if (n != null) {
+        childrenFound.add(n);
+        n.parse(parser, handler, values, stack, recordStarted);
+      } else {
+        skipTag(parser);
+      }
+    }
+
     private Node getMatchingChild(XMLStreamReader parser) {
       if (childNodes == null)
         return null;

Modified: lucene/solr/trunk/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestXPathRecordReader.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestXPathRecordReader.java?rev=739962&r1=739961&r2=739962&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestXPathRecordReader.java (original)
+++ lucene/solr/trunk/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestXPathRecordReader.java Mon Feb  2 11:30:18 2009
@@ -25,9 +25,7 @@
 import java.util.Map;
 
 /**
- * <p>
- * Test for XPathRecordReader
- * </p>
+ * <p> Test for XPathRecordReader </p>
  *
  * @version $Id$
  * @since solr 1.3
@@ -136,6 +134,28 @@
   }
 
   @Test
+  public void mixedContent() {
+    String xml = "<xhtml:p xmlns:xhtml=\"http://xhtml.com/\" >This text is \n" +
+            "  <xhtml:b>bold</xhtml:b> and this text is \n" +
+            "  <xhtml:u>underlined</xhtml:u>!\n" +
+            "</xhtml:p>";
+    XPathRecordReader rr = new XPathRecordReader("/p");
+    rr.addField("p", "/p", true);
+    rr.addField("b", "/p/b", true);
+    rr.addField("u", "/p/u", true);
+    List<Map<String, Object>> l = rr.getAllRecords(new StringReader(xml));
+    Map<String, Object> row = l.get(0);
+
+    Assert.assertEquals("bold", ((List) row.get("b")).get(0));
+    Assert.assertEquals("underlined", ((List) row.get("u")).get(0));
+    String p = (String) ((List) row.get("p")).get(0);
+    Assert.assertTrue(p.contains("This text is"));
+    Assert.assertTrue(p.contains("and this text is"));
+    Assert.assertTrue(p.contains("!"));
+
+  }
+
+  @Test
   public void elems2LevelWithAttrib() {
     String xml = "<root>\n" + "\t<a>\n" + "\t   <b k=\"x\">\n"
             + "\t      <x>x0</x>\n" + "\t      <y>y0</y>\n" + "\t   </b>\n"