You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by sh...@apache.org on 2009/02/05 20:53:11 UTC

svn commit: r741268 - in /lucene/solr/trunk/contrib/dataimporthandler: ./ src/main/java/org/apache/solr/handler/dataimport/ src/test/java/org/apache/solr/handler/dataimport/

Author: shalin
Date: Thu Feb  5 19:53:10 2009
New Revision: 741268

URL: http://svn.apache.org/viewvc?rev=741268&view=rev
Log:
SOLR-1003 -- XPathEntityProcessor must allow slurping all text from a given XML node and its children

Modified:
    lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt
    lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathEntityProcessor.java
    lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathRecordReader.java
    lucene/solr/trunk/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestXPathRecordReader.java

Modified: lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt?rev=741268&r1=741267&r2=741268&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt (original)
+++ lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt Thu Feb  5 19:53:10 2009
@@ -59,6 +59,9 @@
 13.SOLR-980:  A PlainTextEntityProcessor which can read from any DataSource<Reader> and output a String.
               (Nathan Adams, Noble Paul via shalin)
 
+14.SOLR-1003: XPathEntityprocessor must allow slurping all text from a given xml node and its children.
+              (Noble Paul via shalin)
+
 Optimizations
 ----------------------
 1. SOLR-846:  Reduce memory consumption during delta import by removing keys when used

Modified: lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathEntityProcessor.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathEntityProcessor.java?rev=741268&r1=741267&r2=741268&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathEntityProcessor.java (original)
+++ lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathEntityProcessor.java Thu Feb  5 19:53:10 2009
@@ -122,9 +122,14 @@
         for (Map<String, String> field : context.getAllEntityFields()) {
           if (field.get(XPATH) == null)
             continue;
+          int flags = 0;
+          if ("true".equals(field.get("flatten"))) {
+            flags = XPathRecordReader.FLATTEN;
+          }
           xpathReader.addField(field.get(DataImporter.COLUMN),
-                  field.get(XPATH), Boolean.parseBoolean(field
-                          .get(DataImporter.MULTI_VALUED)));
+                  field.get(XPATH),
+                  Boolean.parseBoolean(field.get(DataImporter.MULTI_VALUED)),
+                  flags);
         }
       } catch (RuntimeException e) {
         throw new DataImportHandlerException(SEVERE,

Modified: lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathRecordReader.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathRecordReader.java?rev=741268&r1=741267&r2=741268&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathRecordReader.java (original)
+++ lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathRecordReader.java Thu Feb  5 19:53:10 2009
@@ -39,6 +39,7 @@
  */
 public class XPathRecordReader {
   private Node rootNode = new Node("/", null);
+  public static final int FLATTEN = 1;
 
   public XPathRecordReader(String forEachXpath) {
     String[] splits = forEachXpath.split("\\|");
@@ -46,24 +47,30 @@
       split = split.trim();
       if (split.length() == 0)
         continue;
-      addField0(split, split, false, true);
+      addField0(split, split, false, true, 0);
     }
   }
 
-  public synchronized XPathRecordReader addField(String name, String xpath,
-                                                 boolean multiValued) {
+  public synchronized XPathRecordReader addField(String name, String xpath, boolean multiValued) {
     if (!xpath.startsWith("/"))
       throw new RuntimeException("xpath must start with '/' : " + xpath);
-    addField0(xpath, name, multiValued, false);
+    addField0(xpath, name, multiValued, false, 0);
+    return this;
+  }
+
+  public synchronized XPathRecordReader addField(String name, String xpath, boolean multiValued, int flags) {
+    if (!xpath.startsWith("/"))
+      throw new RuntimeException("xpath must start with '/' : " + xpath);
+    addField0(xpath, name, multiValued, false, flags);
     return this;
   }
 
   private void addField0(String xpath, String name, boolean multiValued,
-                         boolean isRecord) {
+                         boolean isRecord, int flags) {
     List<String> paths = new LinkedList<String>(Arrays.asList(xpath.split("/")));
     if ("".equals(paths.get(0).trim()))
       paths.remove(0);
-    rootNode.build(paths, name, multiValued, isRecord);
+    rootNode.build(paths, name, multiValued, isRecord, flags);
   }
 
   public List<Map<String, Object>> getAllRecords(Reader r) {
@@ -97,6 +104,8 @@
 
     boolean hasText = false, multiValued = false, isRecord = false;
 
+    private boolean flatten;
+
     public Node(String name, Node p) {
       xpathName = this.name = name;
       parent = p;
@@ -167,7 +176,22 @@
               if(event == CDATA || event == CHARACTERS || event == SPACE) {
                 text = text + parser.getText();
               } else if(event == START_ELEMENT) {
-                handleStartElement(parser, childrenFound, handler, values, stack, recordStarted);
+                if (flatten) {
+                  int starts = 1;
+                  while (true) {
+                    event = parser.next();
+                    if (event == CDATA || event == CHARACTERS || event == SPACE) {
+                      text = text + parser.getText();
+                    } else if (event == START_ELEMENT) {
+                      starts++;
+                    } else if (event == END_ELEMENT) {
+                      starts--;
+                      if (starts == 0) break;
+                    }
+                  }
+                } else {
+                  handleStartElement(parser, childrenFound, handler, values, stack, recordStarted);
+                }
               } else {
                 break;
               }
@@ -275,7 +299,7 @@
     }
 
     public void build(List<String> paths, String fieldName,
-                      boolean multiValued, boolean record) {
+                      boolean multiValued, boolean record, int flags) {
       String name = paths.remove(0);
       if (paths.isEmpty() && name.startsWith("@")) {
         if (attributes == null) {
@@ -296,9 +320,10 @@
             n.hasText = true;
             n.fieldName = fieldName;
             n.multiValued = multiValued;
+            n.flatten = flags == FLATTEN;
           }
         } else {
-          n.build(paths, fieldName, multiValued, record);
+          n.build(paths, fieldName, multiValued, record, flags);
         }
       }
     }

Modified: lucene/solr/trunk/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestXPathRecordReader.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestXPathRecordReader.java?rev=741268&r1=741267&r2=741268&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestXPathRecordReader.java (original)
+++ lucene/solr/trunk/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestXPathRecordReader.java Thu Feb  5 19:53:10 2009
@@ -152,7 +152,23 @@
     Assert.assertTrue(p.contains("This text is"));
     Assert.assertTrue(p.contains("and this text is"));
     Assert.assertTrue(p.contains("!"));
+    // Should not contain content from child elements
+    Assert.assertFalse(p.contains("bold"));
+  }
 
+  @Test
+  public void mixedContentFlattened() {
+    String xml = "<xhtml:p xmlns:xhtml=\"http://xhtml.com/\" >This text is \n" +
+            "  <xhtml:b>bold</xhtml:b> and this text is \n" +
+            "  <xhtml:u>underlined</xhtml:u>!\n" +
+            "</xhtml:p>";
+    XPathRecordReader rr = new XPathRecordReader("/p");
+    rr.addField("p", "/p", false, XPathRecordReader.FLATTEN);
+    List<Map<String, Object>> l = rr.getAllRecords(new StringReader(xml));
+    Map<String, Object> row = l.get(0);
+    Assert.assertEquals("This text is \n" +
+            "  bold and this text is \n" +
+            "  underlined!", ((String)row.get("p")).trim() );
   }
 
   @Test