You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by da...@apache.org on 2014/09/26 06:57:34 UTC

svn commit: r1627713 - in /pig/trunk: ./ src/docs/src/documentation/content/xdocs/ src/org/apache/pig/builtin/ test/org/apache/pig/test/

Author: daijy
Date: Fri Sep 26 04:57:34 2014
New Revision: 1627713

URL: http://svn.apache.org/r1627713
Log:
PIG-3870: STRSPLITTOBAG UDF

Added:
    pig/trunk/src/org/apache/pig/builtin/STRSPLITTOBAG.java
Modified:
    pig/trunk/CHANGES.txt
    pig/trunk/src/docs/src/documentation/content/xdocs/func.xml
    pig/trunk/src/docs/src/documentation/content/xdocs/pig-index.xml
    pig/trunk/test/org/apache/pig/test/TestBuiltin.java
    pig/trunk/test/org/apache/pig/test/TestStringUDFs.java

Modified: pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1627713&r1=1627712&r2=1627713&view=diff
==============================================================================
--- pig/trunk/CHANGES.txt (original)
+++ pig/trunk/CHANGES.txt Fri Sep 26 04:57:34 2014
@@ -24,6 +24,8 @@ INCOMPATIBLE CHANGES
  
 IMPROVEMENTS
 
+PIG-3870: STRSPLITTOBAG UDF (cryptoe via daijy)
+
 PIG-4080: Add Preprocessor commands and more to the black/whitelisting feature (prkommireddi via daijy)
 
 PIG-4162: Intermediate reducer parallelism in Tez should be higher (rohini)

Modified: pig/trunk/src/docs/src/documentation/content/xdocs/func.xml
URL: http://svn.apache.org/viewvc/pig/trunk/src/docs/src/documentation/content/xdocs/func.xml?rev=1627713&r1=1627712&r2=1627713&view=diff
==============================================================================
--- pig/trunk/src/docs/src/documentation/content/xdocs/func.xml (original)
+++ pig/trunk/src/docs/src/documentation/content/xdocs/func.xml Fri Sep 26 04:57:34 2014
@@ -4358,7 +4358,70 @@ For example, given the string (open:sour
 </section>
 </section> 
 
-<!-- ======================================================== -->  
+<!-- ======================================================== -->
+    <section id="strsplittobag">
+        <title>STRSPLITTOBAG</title>
+        <p>Splits a string around matches of a given regular expression and returns a databag</p>
+        <section>
+            <title>Syntax</title>
+            <table>
+                <tr>
+                    <td>
+                        <p>STRSPLITTOBAG(string, regex, limit)</p>
+                    </td>
+                </tr>
+            </table>
+        </section>
+        <section>
+            <title>Terms</title>
+            <table>
+                <tr>
+                    <td>
+                        <p>string</p>
+                    </td>
+                    <td>
+                        <p>The string to be split.</p>
+                    </td>
+                </tr>
+                <tr>
+                    <td>
+                        <p>regex</p>
+                    </td>
+                    <td>
+                        <p>The regular expression.</p>
+                    </td>
+                </tr>
+                <tr>
+                    <td>
+                        <p>limit</p>
+                    </td>
+                    <td>
+                        <p>If the value is positive, the pattern (the compiled representation of the regular expression)
+                            is applied at most limit-1 times, therefore the value of the argument means the maximum size
+                            of the result bag. The last tuple of the result bag will contain all input after the last
+                            match.
+                        </p>
+                        <p>If the value is negative, no limit is applied to the size of the result bag.</p>
+                        <p>If the value is zero, no limit is applied to the size of the result bag too, and trailing
+                            empty strings (if any) will be removed.
+                        </p>
+                    </td>
+                </tr>
+            </table>
+        </section>
+        <section>
+            <title>Usage</title>
+            <p>
+                Use the STRSPLITTOBAG function to split a string around matches of a given regular expression.
+            </p>
+            <p>
+                For example, given the string (open:source:software), STRSPLITTOBAG (string, ':',2) will return
+                {(open),(source:software)} and STRSPLITTOBAG (string, ':',3) will return {(open),(source),(software)}.
+            </p>
+        </section>
+    </section>
+
+    <!-- ======================================================== -->
  <section id="substring">
    <title>SUBSTRING</title>
    <p>Returns a substring from a given string. </p>

Modified: pig/trunk/src/docs/src/documentation/content/xdocs/pig-index.xml
URL: http://svn.apache.org/viewvc/pig/trunk/src/docs/src/documentation/content/xdocs/pig-index.xml?rev=1627713&r1=1627712&r2=1627713&view=diff
==============================================================================
--- pig/trunk/src/docs/src/documentation/content/xdocs/pig-index.xml (original)
+++ pig/trunk/src/docs/src/documentation/content/xdocs/pig-index.xml Fri Sep 26 04:57:34 2014
@@ -964,6 +964,8 @@
 
 <p><a href="func.html#strsplit">STRSPLIT</a> function</p>
 
+<p><a href="func.html#strsplittobag">STRSPLITTOBAG</a> function</p>
+
 <p><a href="func.html#substring">SUBSTRING</a> function</p>
 
 <p><a href="func.html#sum">SUM</a> function</p>

Added: pig/trunk/src/org/apache/pig/builtin/STRSPLITTOBAG.java
URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/builtin/STRSPLITTOBAG.java?rev=1627713&view=auto
==============================================================================
--- pig/trunk/src/org/apache/pig/builtin/STRSPLITTOBAG.java (added)
+++ pig/trunk/src/org/apache/pig/builtin/STRSPLITTOBAG.java Fri Sep 26 04:57:34 2014
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.pig.builtin;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.FuncSpec;
+import org.apache.pig.PigWarning;
+import org.apache.pig.data.*;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.PatternSyntaxException;
+
+/**
+ * Wrapper around Java's String.split<br>
+ * input tuple: first column is assumed to have a string to split;<br>
+ * the optional second column is assumed to have the delimiter or regex to split on;<br>
+ * if not provided, it's assumed to be '\s' (space)<br>
+ * the optional third column may provide a limit to the number of results.<br>
+ * If limit is not provided, 0 is assumed, as per Java's split().
+ */
+
+public class STRSPLITTOBAG extends EvalFunc<DataBag> {
+
+    private final static BagFactory bagFactory = BagFactory.getInstance();
+    private final static TupleFactory tupleFactory = TupleFactory.getInstance();
+
+    /**
+     * Wrapper around Java's String.split
+     *
+     * @param input tuple; first column is assumed to have a string to split;
+     *              the optional second column is assumed to have the delimiter or regex to split on;<br>
+     *              if not provided, it's assumed to be '\s' (space)
+     *              the optional third column may provide a limit to the number of results.<br>
+     *              If limit is not provided, 0 is assumed, as per Java's split().
+     * @throws java.io.IOException
+     */
+    @Override
+    public DataBag exec(Tuple input) throws IOException {
+        if (input == null || input.size() < 1) {
+            return null;
+        }
+        try {
+            String source = (String) input.get(0);
+            String delim = (input.size() > 1) ? (String) input.get(1) : "\\s";
+            int length = (input.size() > 2) ? (Integer) input.get(2) : 0;
+            if (source == null || delim == null) {
+                return null;
+            }
+
+            String[] splits = source.split(delim, length);
+            DataBag dataBag = bagFactory.newDefaultBag();
+            for (String eachSplit : splits) {
+                Tuple tuple = tupleFactory.newTuple(1);
+                tuple.set(0, eachSplit);
+                dataBag.add(tuple);
+            }
+            return dataBag;
+        } catch (ClassCastException e) {
+            warn("class cast exception at " + e.getStackTrace()[0], PigWarning.UDF_WARNING_1);
+        } catch (PatternSyntaxException e) {
+            warn(e.getMessage(), PigWarning.UDF_WARNING_1);
+        }
+        // this only happens if the try block did not complete normally
+        return null;
+    }
+
+    @Override
+    public Schema outputSchema(Schema input) {
+        return new Schema(new Schema.FieldSchema(null, DataType.BAG));
+    }
+
+    @Override
+    public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
+        List<FuncSpec> funcList = new ArrayList<FuncSpec>();
+        Schema s = new Schema(new Schema.FieldSchema(null, DataType.CHARARRAY));
+
+        Schema s1 = new Schema();
+        s1.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+        s1.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+
+        Schema s2 = new Schema();
+        s2.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+        s2.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+        s2.add(new Schema.FieldSchema(null, DataType.INTEGER));
+
+        funcList.add(new FuncSpec(this.getClass().getName(), s));
+        funcList.add(new FuncSpec(this.getClass().getName(), s1));
+        funcList.add(new FuncSpec(this.getClass().getName(), s2));
+        return funcList;
+    }
+
+    @Override
+    public boolean allowCompileTimeCalculation() {
+        return true;
+    }
+}

Modified: pig/trunk/test/org/apache/pig/test/TestBuiltin.java
URL: http://svn.apache.org/viewvc/pig/trunk/test/org/apache/pig/test/TestBuiltin.java?rev=1627713&r1=1627712&r2=1627713&view=diff
==============================================================================
--- pig/trunk/test/org/apache/pig/test/TestBuiltin.java (original)
+++ pig/trunk/test/org/apache/pig/test/TestBuiltin.java Fri Sep 26 04:57:34 2014
@@ -2731,14 +2731,16 @@ public class TestBuiltin {
         pigServer.registerQuery("=> load '" + Util.encodeEscape(inputFile.getAbsolutePath()) + "' as (name: chararray);");
         pigServer.registerQuery("B = foreach @ generate SUBSTRING(name, 0, 3), " +
             "INDEXOF(name, 'a'), INDEXOF(name, 'a', 3), LAST_INDEX_OF(name, 'a'), REPLACE(name, 'a', 'b'), " +
-            "STRSPLIT(name), STRSPLIT(name, ' '), STRSPLIT(name, ' ', 0), TRIM(name);");
+                "STRSPLIT(name), STRSPLIT(name, ' '), STRSPLIT(name, ' ', 0), STRSPLITTOBAG(name), STRSPLITTOBAG(name,' ')" +
+                ", STRSPLITTOBAG(name,' ',0), TRIM(name);");
 
         Iterator<Tuple> it = pigServer.openIterator("B");
         assertTrue(it.hasNext());
         Tuple t = it.next();
         Tuple expected = Util.buildTuple("amy", "smith");
+        DataBag expectedBag = Util.createBag(new Tuple[]{Util.buildTuple("amy"), Util.buildTuple("smith")});
         assertTrue(!it.hasNext());
-        assertEquals(9, t.size());
+        assertEquals(12, t.size());
         assertEquals("amy", t.get(0));
         assertEquals(0, t.get(1));
         assertEquals(-1, t.get(2));
@@ -2747,7 +2749,10 @@ public class TestBuiltin {
         assertEquals(expected, t.get(5));
         assertEquals(expected, t.get(6));
         assertEquals(expected, t.get(7));
-        assertEquals("amy smith", t.get(8));
+        assertEquals(expectedBag, t.get(8));
+        assertEquals(expectedBag, t.get(9));
+        assertEquals(expectedBag, t.get(10));
+        assertEquals("amy smith", t.get(11));
 
         // test untyped data
         pigServer.registerQuery("=> load '" + Util.encodeEscape(inputFile.getAbsolutePath()) + "' as (name);");

Modified: pig/trunk/test/org/apache/pig/test/TestStringUDFs.java
URL: http://svn.apache.org/viewvc/pig/trunk/test/org/apache/pig/test/TestStringUDFs.java?rev=1627713&r1=1627712&r2=1627713&view=diff
==============================================================================
--- pig/trunk/test/org/apache/pig/test/TestStringUDFs.java (original)
+++ pig/trunk/test/org/apache/pig/test/TestStringUDFs.java Fri Sep 26 04:57:34 2014
@@ -32,11 +32,13 @@ import org.apache.pig.builtin.REPLACE;
 import org.apache.pig.builtin.STARTSWITH;
 import org.apache.pig.builtin.ENDSWITH;
 import org.apache.pig.builtin.STRSPLIT;
+import org.apache.pig.builtin.STRSPLITTOBAG;
 import org.apache.pig.builtin.SUBSTRING;
 import org.apache.pig.builtin.TRIM;
 import org.apache.pig.builtin.LTRIM;
 import org.apache.pig.builtin.RTRIM;
 import org.apache.pig.builtin.EqualsIgnoreCase;
+import org.apache.pig.data.DataBag;
 import org.apache.pig.data.Tuple;
 import org.apache.pig.data.TupleFactory;
 import org.junit.Test;
@@ -227,6 +229,77 @@ public class TestStringUDFs {
     }
 
     @Test
+    public void testSplitToBag() throws IOException {
+        STRSPLITTOBAG bagSplit = new STRSPLITTOBAG();
+
+        //test no delims in input
+        Tuple testTuple = Util.buildTuple("1 2 3", "4");
+        DataBag outputBag = bagSplit.exec(testTuple);
+        assertEquals("No of records split should be 1", 1, outputBag.size());
+        assertEquals("Split string should match the input string", "(1 2 3)", outputBag.iterator().next().toString());
+
+        //test default delimiter
+        testTuple = Util.buildTuple("1 2 3");
+        outputBag = bagSplit.exec(testTuple);
+        String[] assertionArray = {"1", "2", "3"};
+        assertEquals("No of record split should be " + assertionArray.length, assertionArray.length, outputBag.size());
+
+        int i = 0;
+        for (Tuple t : outputBag) {
+            assertEquals("Assertion tests on split strings", "(" + assertionArray[i] + ")", t.toString());
+            i++;
+        }
+
+        //test split on specified delimiter
+        testTuple = Util.buildTuple("1:2:3", ":");
+        outputBag = bagSplit.exec(testTuple);
+        assertEquals("No of record split should be " + assertionArray.length, assertionArray.length, outputBag.size());
+        i = 0;
+        for (Tuple t : outputBag) {
+            assertEquals("Assertion tests on split strings", "(" + assertionArray[i] + ")", t.toString());
+            i++;
+        }
+
+        // test limiting results with limit
+        testTuple = Util.buildTuple("1:2:3", ":", 2);
+        outputBag = bagSplit.exec(testTuple);
+        assertionArray = new String[]{"1", "2:3"};
+        assertEquals("No of record split should be " + assertionArray.length, assertionArray.length, outputBag.size());
+        i = 0;
+        for (Tuple t : outputBag) {
+            assertEquals("Matched records in split results with limit", "(" + assertionArray[i] + ")", t.toString());
+            i++;
+        }
+
+        // test trimming of whitespace
+        testTuple = Util.buildTuple("1 2    ");
+        outputBag = bagSplit.exec(testTuple);
+        assertionArray = new String[]{"1", "2"};
+        assertEquals("No of record split should be " + assertionArray.length, assertionArray.length, outputBag.size());
+        i = 0;
+        for (Tuple t : outputBag) {
+            assertEquals("Matched records in split results with trimming of whitespaces", "(" + assertionArray[i] + ")", t.toString());
+            i++;
+        }
+
+        // test forcing null matches with length param
+        testTuple = Util.buildTuple("1:2:::", ":", 10);
+        outputBag = bagSplit.exec(testTuple);
+        assertionArray = new String[]{"1", "2", "", "", ""};
+        assertEquals("No of record split should be " + assertionArray.length, assertionArray.length, outputBag.size());
+        i = 0;
+        for (Tuple t : outputBag) {
+            assertEquals("Matched records in split results with forcing null matched with limit", "(" + assertionArray[i] + ")", t.toString());
+            i++;
+        }
+
+        //test wrong schemas
+        testTuple = Util.buildTuple(1, 2, 3);
+        outputBag = bagSplit.exec(testTuple);
+        assertEquals("Wrong Schema checks", null, outputBag);
+    }
+
+    @Test
     public void testStartsWith() throws IOException {
         STARTSWITH startsWith = new STARTSWITH();
         Tuple testTuple1 = Util.buildTuple("foo", "bar");