You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by da...@apache.org on 2014/10/24 22:59:00 UTC

svn commit: r1634137 - in /pig/trunk: CHANGES.txt contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java

Author: daijy
Date: Fri Oct 24 20:59:00 2014
New Revision: 1634137

URL: http://svn.apache.org/r1634137
Log:
PIG-4242: For indented xmls with multiline content (e.g. wikipedia) XMLLoader cuts out the beginning of every line

Modified:
    pig/trunk/CHANGES.txt
    pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java
    pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java

Modified: pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1634137&r1=1634136&r2=1634137&view=diff
==============================================================================
--- pig/trunk/CHANGES.txt (original)
+++ pig/trunk/CHANGES.txt Fri Oct 24 20:59:00 2014
@@ -26,7 +26,8 @@ IMPROVEMENTS
  
 BUG FIXES
 
-
+PIG-4242: For indented xmls with multiline content (e.g. wikipedia) XMLLoader cuts out the beginning of every line
+ (holdfenytolvaj via daijy)
 
 Release 0.14.0 - Unreleased
  

Modified: pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java
URL: http://svn.apache.org/viewvc/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java?rev=1634137&r1=1634136&r2=1634137&view=diff
==============================================================================
--- pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java (original)
+++ pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java Fri Oct 24 20:59:00 2014
@@ -150,11 +150,12 @@ public class XMLLoader extends LoadFunc 
       // In case of an tag matched with an open tag and a closed tag, this buffer
       // is used to accumulate matched element if it is spans multiple lines.
       StringBuffer currentMatch = new StringBuffer();
-      // The start offset of first matched open tag. This marks the first byte
-      // in the range to be copied to output.
-      int offsetOfFirstMatchedOpenTag = 0;
       try {
       while (true) {
+          // The start offset of first matched open tag. This marks the first byte
+          // in the range to be copied to output.
+          int offsetOfFirstMatchedOpenTag = 0;
+    	  
 	while (buffer == null || buffer.length() == 0) {
 	  if (!wrapped.nextKeyValue())
 	    return false; // End of split

Modified: pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java
URL: http://svn.apache.org/viewvc/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java?rev=1634137&r1=1634136&r2=1634137&view=diff
==============================================================================
--- pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java (original)
+++ pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java Fri Oct 24 20:59:00 2014
@@ -99,6 +99,12 @@ public class TestXMLLoader extends TestC
     inlineClosedTags.add(new String[] { "</events>"});
   }
 
+    public static ArrayList<String[]> indentedXmlWithMultilineLineContent = new ArrayList<String[]>();
+    static {
+        indentedXmlWithMultilineLineContent.add(new String[] { "    <page>You have " });
+        indentedXmlWithMultilineLineContent.add(new String[] { "not missed it</page>" });
+    }
+
   public void testShouldReturn0TupleCountIfSearchTagIsNotFound () throws Exception {
     String filename = TestHelper.createTempFile(data, "");
     PigServer pig = new PigServer(LOCAL);
@@ -354,6 +360,27 @@ public class TestXMLLoader extends TestC
      assertEquals(4, tupleCount);
    }
 
+    public void testXMLLoaderShouldWorkWithIndentedXmlWithMultilineContent() throws Exception {
+        String filename = TestHelper.createTempFile(indentedXmlWithMultilineLineContent, "");
+        PigServer pig = new PigServer(LOCAL);
+        filename = filename.replace("\\", "\\\\");
+        String query = "A = LOAD '" + filename + "' USING org.apache.pig.piggybank.storage.XMLLoader('page') as (doc:chararray);";
+        pig.registerQuery(query);
+        Iterator<?> it = pig.openIterator("A");
+        int tupleCount = 0;
+        while (it.hasNext()) {
+            Tuple tuple = (Tuple) it.next();
+            if (tuple == null)
+                break;
+            else {
+                System.out.println(((String) tuple.get(0)));
+                assertTrue(((String) tuple.get(0)).equals("<page>You have not missed it</page>"));
+                tupleCount++;
+            }
+        }
+        assertEquals(1, tupleCount);
+    }
+
    public void testXMLLoaderShouldReturnValidXML() throws Exception {
      String filename = TestHelper.createTempFile(inlineClosedTags, "");
      PigServer pig = new PigServer(LOCAL);