You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by da...@apache.org on 2011/07/12 05:49:16 UTC
svn commit: r1145443 - in /pig/trunk: CHANGES.txt
contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java
contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java
Author: daijy
Date: Tue Jul 12 03:49:16 2011
New Revision: 1145443
URL: http://svn.apache.org/viewvc?rev=1145443&view=rev
Log:
PIG-2147: Support nested tags for XMLLoader
Modified:
pig/trunk/CHANGES.txt
pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java
pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java
Modified: pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1145443&r1=1145442&r2=1145443&view=diff
==============================================================================
--- pig/trunk/CHANGES.txt (original)
+++ pig/trunk/CHANGES.txt Tue Jul 12 03:49:16 2011
@@ -66,6 +66,8 @@ PIG-2011: Speed up TestTypedMap.java (dv
BUG FIXES
+PIG-2147: Support nested tags for XMLLoader (vivekp via daijy)
+
PIG-1890: Fix piggybank unit test TestAvroStorage (kengoodhope via daijy)
PIG-2110: NullPointerException in piggybank.evaluation.util.apachelogparser.SearchTermExtractor (dale_jin via daijy)
Modified: pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java
URL: http://svn.apache.org/viewvc/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java?rev=1145443&r1=1145442&r2=1145443&view=diff
==============================================================================
--- pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java (original)
+++ pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java Tue Jul 12 03:49:16 2011
@@ -201,16 +201,29 @@ class XMLLoaderBufferedPositionedInputSt
tag[2+i] = tmp[i];
}
tag[tmp.length+2] = (byte)'>';
+
+
+ // Create the start tag bytes ('<' plus the tag name, without the closing '>') to handle nested tags
+ byte[] startTag = new byte[tmp.length + 1];
+ startTag[0] = (byte)'<';
+ for (int i = 0; i < tmp.length; ++i) {
+ startTag[1+i] = tmp[i];
+ }
+ //startTag[tmp.length+1] = (byte)'>';
+
+
ByteArrayOutputStream collectBuf = new ByteArrayOutputStream(1024);
int idxTagChar = 0;
-
+ int idxStartTagChar = 0;
+ boolean startTagMatched = false;
/*
* Read until an end tag is found. It need not check for any other condition, since
* it simply tries to read until the end. One issue that may arise: if the XML
* content is very large, or if the end tag is missing from a large file,
* then it may exhaust the memory.
*/
+ int nestedTags = 0;
while (true) {
int b = -1;
try {
@@ -223,15 +236,38 @@ class XMLLoaderBufferedPositionedInputSt
}
collectBuf.write((byte)(b));
+ // Check if the start tag has matched except for the last char
+ if(startTagMatched )
+ {
+ startTagMatched = false;
+ idxStartTagChar = 0;
+ if (b == ' ' || b == '\t' || b == '>')
+ ++nestedTags;// increment the nesting count
+ }
+
+ if (b == startTag[idxStartTagChar]){
+ ++idxStartTagChar;
+ if(idxStartTagChar == startTag.length)
+ startTagMatched = true ; // Set the flag as true if start tag matches
+ }else
+ idxStartTagChar = 0;
+
+
+
// start to match the target close tag
if (b == tag[idxTagChar]) {
++idxTagChar;
if (idxTagChar == tag.length) {
- break;
+ if(nestedTags==0) // Break the loop if there were no nested tags
+ break;
+ else{
+ --nestedTags; // Else decrement the count
+ idxTagChar = 0; // Reset the index
+ }
}
- } else {
- idxTagChar = 0;
- }
+ } else
+ idxTagChar = 0;
+
}
catch (IOException e) {
this.setReadable(false);
Modified: pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java
URL: http://svn.apache.org/viewvc/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java?rev=1145443&r1=1145442&r2=1145443&view=diff
==============================================================================
--- pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java (original)
+++ pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java Tue Jul 12 03:49:16 2011
@@ -18,7 +18,6 @@ import static org.apache.pig.ExecType.LO
import java.io.File;
import java.util.ArrayList;
import java.util.Iterator;
-import java.util.regex.Pattern;
import junit.framework.TestCase;
@@ -28,7 +27,6 @@ import org.apache.pig.data.Tuple;
public class TestXMLLoader extends TestCase {
private static String patternString = "(\\d+)!+(\\w+)~+(\\w+)";
- private final static Pattern pattern = Pattern.compile(patternString);
public static ArrayList<String[]> data = new ArrayList<String[]>();
static {
data.add(new String[] { "<configuration>"});
@@ -44,7 +42,39 @@ public class TestXMLLoader extends TestC
data.add(new String[] { "</property>"});
data.add(new String[] { "</configuration>"});
}
-
+
+ public static ArrayList<String[]> nestedTags = new ArrayList<String[]>();
+ static {
+ nestedTags.add(new String[] { "<events>"});
+ nestedTags.add(new String[] { "<event id='116913365'>"});
+ nestedTags.add(new String[] { "<eventRank>1.000000000000</eventRank>"});
+ nestedTags.add(new String[] { "<name>XY</name>"});
+ nestedTags.add(new String[] { "<relatedEvents>"});
+ nestedTags.add(new String[] { "<event id='116913365'>x</event>"});
+ nestedTags.add(new String[] { "<event id='116913365'>y</event>"});
+ nestedTags.add(new String[] { "</relatedEvents>"});
+ nestedTags.add(new String[] { "</event>"});
+
+ nestedTags.add(new String[] { "<event id='116913365'>"});
+ nestedTags.add(new String[] { "<eventRank>3.0000</eventRank>"});
+ nestedTags.add(new String[] { "<name>AB</name>"});
+ nestedTags.add(new String[] { "<relatedEvents>"});
+ nestedTags.add(new String[] { "<event id='116913365'>a</event>"});
+ nestedTags.add(new String[] { "<event id='116913365'>b</event>"});
+ nestedTags.add(new String[] { "</relatedEvents>"});
+ nestedTags.add(new String[] { "</event>"});
+
+ nestedTags.add(new String[] { "<event>"});
+ nestedTags.add(new String[] { "<eventRank>4.0000</eventRank>"});
+ nestedTags.add(new String[] { "<name>CD</name>"});
+ nestedTags.add(new String[] { "<relatedEvents>"});
+ nestedTags.add(new String[] { "<event>c</event>"});
+ nestedTags.add(new String[] { "<event>d</event>"});
+ nestedTags.add(new String[] { "</relatedEvents>"});
+ nestedTags.add(new String[] { "</event>"});
+ nestedTags.add(new String[] { "</events>"});
+ }
+
public void testShouldReturn0TupleCountIfSearchTagIsNotFound () throws Exception
{
String filename = TestHelper.createTempFile(data, "");
@@ -83,7 +113,6 @@ public class TestXMLLoader extends TestC
if (tuple == null)
break;
else {
- //TestHelper.examineTuple(expected, tuple, tupleCount);
if (tuple.size() > 0) {
tupleCount++;
}
@@ -254,6 +283,7 @@ public class TestXMLLoader extends TestC
}
assertEquals(0, tupleCount);
}
+
public void testShouldReturn0TupleCountIfEmptyFileIsPassed() throws Exception
{
// modify the data content to avoid end tag for </ignoreProperty>
@@ -280,4 +310,28 @@ public class TestXMLLoader extends TestC
assertEquals(0, tupleCount);
}
+ public void testXMLLoaderShouldSupportNestedTagWithSameName() throws Exception {
+
+ String filename = TestHelper.createTempFile(nestedTags, "");
+ PigServer pig = new PigServer(LOCAL);
+ filename = filename.replace("\\", "\\\\");
+ patternString = patternString.replace("\\", "\\\\");
+ String query = "A = LOAD 'file:" + filename + "' USING org.apache.pig.piggybank.storage.XMLLoader('event') as (doc:chararray);";
+ pig.registerQuery(query);
+ Iterator<?> it = pig.openIterator("A");
+ int tupleCount = 0;
+ while (it.hasNext()) {
+ Tuple tuple = (Tuple) it.next();
+ if (tuple == null)
+ break;
+ else {
+ if (tuple.size() > 0) {
+ tupleCount++;
+ }
+ }
+ }
+ assertEquals(3, tupleCount);
+ }
+
+
}