You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by sh...@apache.org on 2009/02/05 20:53:11 UTC
svn commit: r741268 - in /lucene/solr/trunk/contrib/dataimporthandler: ./
src/main/java/org/apache/solr/handler/dataimport/
src/test/java/org/apache/solr/handler/dataimport/
Author: shalin
Date: Thu Feb 5 19:53:10 2009
New Revision: 741268
URL: http://svn.apache.org/viewvc?rev=741268&view=rev
Log:
SOLR-1003 -- XPathEntityProcessor must allow slurping all text from a given XML node and its children
Modified:
lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt
lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathEntityProcessor.java
lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathRecordReader.java
lucene/solr/trunk/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestXPathRecordReader.java
Modified: lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt?rev=741268&r1=741267&r2=741268&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt (original)
+++ lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt Thu Feb 5 19:53:10 2009
@@ -59,6 +59,9 @@
13.SOLR-980: A PlainTextEntityProcessor which can read from any DataSource<Reader> and output a String.
(Nathan Adams, Noble Paul via shalin)
+14.SOLR-1003: XPathEntityprocessor must allow slurping all text from a given xml node and its children.
+ (Noble Paul via shalin)
+
Optimizations
----------------------
1. SOLR-846: Reduce memory consumption during delta import by removing keys when used
Modified: lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathEntityProcessor.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathEntityProcessor.java?rev=741268&r1=741267&r2=741268&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathEntityProcessor.java (original)
+++ lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathEntityProcessor.java Thu Feb 5 19:53:10 2009
@@ -122,9 +122,14 @@
for (Map<String, String> field : context.getAllEntityFields()) {
if (field.get(XPATH) == null)
continue;
+ int flags = 0;
+ if ("true".equals(field.get("flatten"))) {
+ flags = XPathRecordReader.FLATTEN;
+ }
xpathReader.addField(field.get(DataImporter.COLUMN),
- field.get(XPATH), Boolean.parseBoolean(field
- .get(DataImporter.MULTI_VALUED)));
+ field.get(XPATH),
+ Boolean.parseBoolean(field.get(DataImporter.MULTI_VALUED)),
+ flags);
}
} catch (RuntimeException e) {
throw new DataImportHandlerException(SEVERE,
Modified: lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathRecordReader.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathRecordReader.java?rev=741268&r1=741267&r2=741268&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathRecordReader.java (original)
+++ lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathRecordReader.java Thu Feb 5 19:53:10 2009
@@ -39,6 +39,7 @@
*/
public class XPathRecordReader {
private Node rootNode = new Node("/", null);
+ public static final int FLATTEN = 1;
public XPathRecordReader(String forEachXpath) {
String[] splits = forEachXpath.split("\\|");
@@ -46,24 +47,30 @@
split = split.trim();
if (split.length() == 0)
continue;
- addField0(split, split, false, true);
+ addField0(split, split, false, true, 0);
}
}
- public synchronized XPathRecordReader addField(String name, String xpath,
- boolean multiValued) {
+ public synchronized XPathRecordReader addField(String name, String xpath, boolean multiValued) {
if (!xpath.startsWith("/"))
throw new RuntimeException("xpath must start with '/' : " + xpath);
- addField0(xpath, name, multiValued, false);
+ addField0(xpath, name, multiValued, false, 0);
+ return this;
+ }
+
+ public synchronized XPathRecordReader addField(String name, String xpath, boolean multiValued, int flags) {
+ if (!xpath.startsWith("/"))
+ throw new RuntimeException("xpath must start with '/' : " + xpath);
+ addField0(xpath, name, multiValued, false, flags);
return this;
}
private void addField0(String xpath, String name, boolean multiValued,
- boolean isRecord) {
+ boolean isRecord, int flags) {
List<String> paths = new LinkedList<String>(Arrays.asList(xpath.split("/")));
if ("".equals(paths.get(0).trim()))
paths.remove(0);
- rootNode.build(paths, name, multiValued, isRecord);
+ rootNode.build(paths, name, multiValued, isRecord, flags);
}
public List<Map<String, Object>> getAllRecords(Reader r) {
@@ -97,6 +104,8 @@
boolean hasText = false, multiValued = false, isRecord = false;
+ private boolean flatten;
+
public Node(String name, Node p) {
xpathName = this.name = name;
parent = p;
@@ -167,7 +176,22 @@
if(event == CDATA || event == CHARACTERS || event == SPACE) {
text = text + parser.getText();
} else if(event == START_ELEMENT) {
- handleStartElement(parser, childrenFound, handler, values, stack, recordStarted);
+ if (flatten) {
+ int starts = 1;
+ while (true) {
+ event = parser.next();
+ if (event == CDATA || event == CHARACTERS || event == SPACE) {
+ text = text + parser.getText();
+ } else if (event == START_ELEMENT) {
+ starts++;
+ } else if (event == END_ELEMENT) {
+ starts--;
+ if (starts == 0) break;
+ }
+ }
+ } else {
+ handleStartElement(parser, childrenFound, handler, values, stack, recordStarted);
+ }
} else {
break;
}
@@ -275,7 +299,7 @@
}
public void build(List<String> paths, String fieldName,
- boolean multiValued, boolean record) {
+ boolean multiValued, boolean record, int flags) {
String name = paths.remove(0);
if (paths.isEmpty() && name.startsWith("@")) {
if (attributes == null) {
@@ -296,9 +320,10 @@
n.hasText = true;
n.fieldName = fieldName;
n.multiValued = multiValued;
+ n.flatten = flags == FLATTEN;
}
} else {
- n.build(paths, fieldName, multiValued, record);
+ n.build(paths, fieldName, multiValued, record, flags);
}
}
}
Modified: lucene/solr/trunk/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestXPathRecordReader.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestXPathRecordReader.java?rev=741268&r1=741267&r2=741268&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestXPathRecordReader.java (original)
+++ lucene/solr/trunk/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestXPathRecordReader.java Thu Feb 5 19:53:10 2009
@@ -152,7 +152,23 @@
Assert.assertTrue(p.contains("This text is"));
Assert.assertTrue(p.contains("and this text is"));
Assert.assertTrue(p.contains("!"));
+ // Should not contain content from child elements
+ Assert.assertFalse(p.contains("bold"));
+ }
+ @Test
+ public void mixedContentFlattened() {
+ String xml = "<xhtml:p xmlns:xhtml=\"http://xhtml.com/\" >This text is \n" +
+ " <xhtml:b>bold</xhtml:b> and this text is \n" +
+ " <xhtml:u>underlined</xhtml:u>!\n" +
+ "</xhtml:p>";
+ XPathRecordReader rr = new XPathRecordReader("/p");
+ rr.addField("p", "/p", false, XPathRecordReader.FLATTEN);
+ List<Map<String, Object>> l = rr.getAllRecords(new StringReader(xml));
+ Map<String, Object> row = l.get(0);
+ Assert.assertEquals("This text is \n" +
+ " bold and this text is \n" +
+ " underlined!", ((String)row.get("p")).trim() );
}
@Test