You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by da...@apache.org on 2014/10/24 22:59:00 UTC
svn commit: r1634137 - in /pig/trunk: CHANGES.txt
contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java
contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java
Author: daijy
Date: Fri Oct 24 20:59:00 2014
New Revision: 1634137
URL: http://svn.apache.org/r1634137
Log:
PIG-4242: For indented xmls with multiline content (e.g. wikipedia) XMLLoader cuts out the beginning of every line
Modified:
pig/trunk/CHANGES.txt
pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java
pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java
Modified: pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1634137&r1=1634136&r2=1634137&view=diff
==============================================================================
--- pig/trunk/CHANGES.txt (original)
+++ pig/trunk/CHANGES.txt Fri Oct 24 20:59:00 2014
@@ -26,7 +26,8 @@ IMPROVEMENTS
BUG FIXES
-
+PIG-4242: For indented xmls with multiline content (e.g. wikipedia) XMLLoader cuts out the begining of every line
+ (holdfenytolvaj via daijy)
Release 0.14.0 - Unreleased
Modified: pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java
URL: http://svn.apache.org/viewvc/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java?rev=1634137&r1=1634136&r2=1634137&view=diff
==============================================================================
--- pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java (original)
+++ pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java Fri Oct 24 20:59:00 2014
@@ -150,11 +150,12 @@ public class XMLLoader extends LoadFunc
// In case of an tag matched with an open tag and a closed tag, this buffer
// is used to accumulate matched element if it is spans multiple lines.
StringBuffer currentMatch = new StringBuffer();
- // The start offset of first matched open tag. This marks the first byte
- // in the range to be copied to output.
- int offsetOfFirstMatchedOpenTag = 0;
try {
while (true) {
+ // The start offset of first matched open tag. This marks the first byte
+ // in the range to be copied to output.
+ int offsetOfFirstMatchedOpenTag = 0;
+
while (buffer == null || buffer.length() == 0) {
if (!wrapped.nextKeyValue())
return false; // End of split
Modified: pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java
URL: http://svn.apache.org/viewvc/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java?rev=1634137&r1=1634136&r2=1634137&view=diff
==============================================================================
--- pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java (original)
+++ pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java Fri Oct 24 20:59:00 2014
@@ -99,6 +99,12 @@ public class TestXMLLoader extends TestC
inlineClosedTags.add(new String[] { "</events>"});
}
+ public static ArrayList<String[]> indentedXmlWithMultilineLineContent = new ArrayList<String[]>();
+ static {
+ indentedXmlWithMultilineLineContent.add(new String[] { " <page>You have " });
+ indentedXmlWithMultilineLineContent.add(new String[] { "not missed it</page>" });
+ }
+
public void testShouldReturn0TupleCountIfSearchTagIsNotFound () throws Exception {
String filename = TestHelper.createTempFile(data, "");
PigServer pig = new PigServer(LOCAL);
@@ -354,6 +360,27 @@ public class TestXMLLoader extends TestC
assertEquals(4, tupleCount);
}
+ public void testXMLLoaderShouldWorkWithIndentedXmlWithMultilineContent() throws Exception {
+ String filename = TestHelper.createTempFile(indentedXmlWithMultilineLineContent, "");
+ PigServer pig = new PigServer(LOCAL);
+ filename = filename.replace("\\", "\\\\");
+ String query = "A = LOAD '" + filename + "' USING org.apache.pig.piggybank.storage.XMLLoader('page') as (doc:chararray);";
+ pig.registerQuery(query);
+ Iterator<?> it = pig.openIterator("A");
+ int tupleCount = 0;
+ while (it.hasNext()) {
+ Tuple tuple = (Tuple) it.next();
+ if (tuple == null)
+ break;
+ else {
+ System.out.println(((String) tuple.get(0)));
+ assertTrue(((String) tuple.get(0)).equals("<page>You have not missed it</page>"));
+ tupleCount++;
+ }
+ }
+ assertEquals(1, tupleCount);
+ }
+
public void testXMLLoaderShouldReturnValidXML() throws Exception {
String filename = TestHelper.createTempFile(inlineClosedTags, "");
PigServer pig = new PigServer(LOCAL);