You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by da...@apache.org on 2011/07/12 05:49:16 UTC

svn commit: r1145443 - in /pig/trunk: CHANGES.txt contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java

Author: daijy
Date: Tue Jul 12 03:49:16 2011
New Revision: 1145443

URL: http://svn.apache.org/viewvc?rev=1145443&view=rev
Log:
PIG-2147: Support nested tags for XMLLoader

Modified:
    pig/trunk/CHANGES.txt
    pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java
    pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java

Modified: pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1145443&r1=1145442&r2=1145443&view=diff
==============================================================================
--- pig/trunk/CHANGES.txt (original)
+++ pig/trunk/CHANGES.txt Tue Jul 12 03:49:16 2011
@@ -66,6 +66,8 @@ PIG-2011: Speed up TestTypedMap.java (dv
 
 BUG FIXES
 
+PIG-2147: Support nested tags for XMLLoader (vivekp via daijy)
+
 PIG-1890: Fix piggybank unit test TestAvroStorage (kengoodhope via daijy)
 
 PIG-2110: NullPointerException in piggybank.evaluation.util.apachelogparser.SearchTermExtractor (dale_jin via daijy)

Modified: pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java
URL: http://svn.apache.org/viewvc/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java?rev=1145443&r1=1145442&r2=1145443&view=diff
==============================================================================
--- pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java (original)
+++ pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java Tue Jul 12 03:49:16 2011
@@ -201,16 +201,29 @@ class XMLLoaderBufferedPositionedInputSt
         tag[2+i] = tmp[i];
       }
       tag[tmp.length+2] = (byte)'>';
+      
+      
+      // Build the start-tag prefix ('<' followed by the tag name) used to detect nested tags
+      byte[] startTag = new byte[tmp.length + 1];
+      startTag[0] = (byte)'<';
+      for (int i = 0; i < tmp.length; ++i) {
+         startTag[1+i] = tmp[i];
+       }
+      // No trailing '>' is stored: a start tag may carry attributes, so the byte after the name is checked separately
+      
+      
 
       ByteArrayOutputStream collectBuf = new ByteArrayOutputStream(1024);
       int idxTagChar = 0;
-      
+      int idxStartTagChar = 0;
+      boolean startTagMatched = false;
       /*
        * Read till an end tag is found.It need not check for any condition since it 
        * tries to read it till end.One issue that may happen is that if the xml 
        * content is very huge; or if the end tag is not there in a huge file, 
        * then it may blow up the memory. 
        */
+      int nestedTags = 0;
       while (true) {
         int b = -1;
         try {
@@ -223,15 +236,38 @@ class XMLLoaderBufferedPositionedInputSt
           }
           collectBuf.write((byte)(b));
 
+          // The previous byte completed the start-tag prefix; this byte decides whether it is a real nested start tag
+          if(startTagMatched )
+          {
+             startTagMatched = false;
+             idxStartTagChar = 0;
+             if (b == ' ' || b == '\t' || b == '>')
+                ++nestedTags;// increment the nesting count
+          }
+          
+          if (b == startTag[idxStartTagChar]){
+             ++idxStartTagChar;
+             if(idxStartTagChar == startTag.length)
+                startTagMatched = true ; // Set the flag as true if start tag matches
+          }else
+             idxStartTagChar = 0;
+            
+          
+          
           // start to match the target close tag
           if (b == tag[idxTagChar]) {
             ++idxTagChar;
             if (idxTagChar == tag.length) {
-              break;
+               if(nestedTags==0) // Break the loop if there were no nested tags
+                  break;
+               else{
+                  --nestedTags; // Else decrement the count
+                  idxTagChar = 0; // Reset the index
+               }
             }
-          } else {
-            idxTagChar = 0;
-          }
+          } else 
+            idxTagChar = 0; 
+          
         }
         catch (IOException e) {
           this.setReadable(false);

Modified: pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java
URL: http://svn.apache.org/viewvc/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java?rev=1145443&r1=1145442&r2=1145443&view=diff
==============================================================================
--- pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java (original)
+++ pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java Tue Jul 12 03:49:16 2011
@@ -18,7 +18,6 @@ import static org.apache.pig.ExecType.LO
 import java.io.File;
 import java.util.ArrayList;
 import java.util.Iterator;
-import java.util.regex.Pattern;
 
 import junit.framework.TestCase;
 
@@ -28,7 +27,6 @@ import org.apache.pig.data.Tuple;
 
 public class TestXMLLoader extends TestCase {
   private static String patternString = "(\\d+)!+(\\w+)~+(\\w+)";
-  private final static Pattern pattern = Pattern.compile(patternString);
   public static ArrayList<String[]> data = new ArrayList<String[]>();
   static {
     data.add(new String[] { "<configuration>"});
@@ -44,7 +42,39 @@ public class TestXMLLoader extends TestC
     data.add(new String[] { "</property>"});
     data.add(new String[] { "</configuration>"});
   }
-
+  
+  public static ArrayList<String[]> nestedTags = new ArrayList<String[]>();
+  static {
+     nestedTags.add(new String[] { "<events>"});
+     nestedTags.add(new String[] { "<event id='116913365'>"});
+     nestedTags.add(new String[] { "<eventRank>1.000000000000</eventRank>"});
+     nestedTags.add(new String[] { "<name>XY</name>"});   
+     nestedTags.add(new String[] { "<relatedEvents>"});
+     nestedTags.add(new String[] { "<event id='116913365'>x</event>"});
+     nestedTags.add(new String[] { "<event id='116913365'>y</event>"});
+     nestedTags.add(new String[] { "</relatedEvents>"});
+     nestedTags.add(new String[] { "</event>"});
+    
+     nestedTags.add(new String[] { "<event id='116913365'>"});
+     nestedTags.add(new String[] { "<eventRank>3.0000</eventRank>"});
+     nestedTags.add(new String[] { "<name>AB</name>"});   
+     nestedTags.add(new String[] { "<relatedEvents>"});
+     nestedTags.add(new String[] { "<event id='116913365'>a</event>"});
+     nestedTags.add(new String[] { "<event id='116913365'>b</event>"});
+     nestedTags.add(new String[] { "</relatedEvents>"});
+     nestedTags.add(new String[] { "</event>"});
+     
+     nestedTags.add(new String[] { "<event>"});
+     nestedTags.add(new String[] { "<eventRank>4.0000</eventRank>"});
+     nestedTags.add(new String[] { "<name>CD</name>"});   
+     nestedTags.add(new String[] { "<relatedEvents>"});
+     nestedTags.add(new String[] { "<event>c</event>"});
+     nestedTags.add(new String[] { "<event>d</event>"});
+     nestedTags.add(new String[] { "</relatedEvents>"});
+     nestedTags.add(new String[] { "</event>"});
+     nestedTags.add(new String[] { "</events>"});
+  }
+  
   public void testShouldReturn0TupleCountIfSearchTagIsNotFound () throws Exception
   {
     String filename = TestHelper.createTempFile(data, "");
@@ -83,7 +113,6 @@ public class TestXMLLoader extends TestC
       if (tuple == null)
         break;
       else {
-        //TestHelper.examineTuple(expected, tuple, tupleCount);
         if (tuple.size() > 0) {
             tupleCount++;
         }
@@ -254,6 +283,7 @@ public class TestXMLLoader extends TestC
       }
       assertEquals(0, tupleCount);  
    }
+   
    public void testShouldReturn0TupleCountIfEmptyFileIsPassed() throws Exception
    {
       // modify the data content to avoid end tag for </ignoreProperty>
@@ -280,4 +310,28 @@ public class TestXMLLoader extends TestC
       assertEquals(0, tupleCount);  
    }
    
+   public void testXMLLoaderShouldSupportNestedTagWithSameName() throws Exception {
+      
+      String filename = TestHelper.createTempFile(nestedTags, "");
+      PigServer pig = new PigServer(LOCAL);
+      filename = filename.replace("\\", "\\\\");
+      patternString = patternString.replace("\\", "\\\\");
+      String query = "A = LOAD 'file:" + filename + "' USING org.apache.pig.piggybank.storage.XMLLoader('event') as (doc:chararray);";
+      pig.registerQuery(query);
+      Iterator<?> it = pig.openIterator("A");
+      int tupleCount = 0;
+      while (it.hasNext()) {
+        Tuple tuple = (Tuple) it.next();
+        if (tuple == null)
+          break;
+        else {
+          if (tuple.size() > 0) {
+              tupleCount++;
+          }
+        }
+      }
+      assertEquals(3, tupleCount);  
+   }
+   
+   
 }