You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by da...@apache.org on 2014/09/26 06:57:34 UTC
svn commit: r1627713 - in /pig/trunk: ./
src/docs/src/documentation/content/xdocs/ src/org/apache/pig/builtin/
test/org/apache/pig/test/
Author: daijy
Date: Fri Sep 26 04:57:34 2014
New Revision: 1627713
URL: http://svn.apache.org/r1627713
Log:
PIG-3870: STRSPLITTOBAG UDF
Added:
pig/trunk/src/org/apache/pig/builtin/STRSPLITTOBAG.java
Modified:
pig/trunk/CHANGES.txt
pig/trunk/src/docs/src/documentation/content/xdocs/func.xml
pig/trunk/src/docs/src/documentation/content/xdocs/pig-index.xml
pig/trunk/test/org/apache/pig/test/TestBuiltin.java
pig/trunk/test/org/apache/pig/test/TestStringUDFs.java
Modified: pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1627713&r1=1627712&r2=1627713&view=diff
==============================================================================
--- pig/trunk/CHANGES.txt (original)
+++ pig/trunk/CHANGES.txt Fri Sep 26 04:57:34 2014
@@ -24,6 +24,8 @@ INCOMPATIBLE CHANGES
IMPROVEMENTS
+PIG-3870: STRSPLITTOBAG UDF (cryptoe via daijy)
+
PIG-4080: Add Preprocessor commands and more to the black/whitelisting feature (prkommireddi via daijy)
PIG-4162: Intermediate reducer parallelism in Tez should be higher (rohini)
Modified: pig/trunk/src/docs/src/documentation/content/xdocs/func.xml
URL: http://svn.apache.org/viewvc/pig/trunk/src/docs/src/documentation/content/xdocs/func.xml?rev=1627713&r1=1627712&r2=1627713&view=diff
==============================================================================
--- pig/trunk/src/docs/src/documentation/content/xdocs/func.xml (original)
+++ pig/trunk/src/docs/src/documentation/content/xdocs/func.xml Fri Sep 26 04:57:34 2014
@@ -4358,7 +4358,70 @@ For example, given the string (open:sour
</section>
</section>
-<!-- ======================================================== -->
+<!-- ======================================================== -->
+ <section id="strsplittobag">
+ <title>STRSPLITTOBAG</title>
+ <p>Splits a string around matches of a given regular expression and returns a databag</p>
+ <section>
+ <title>Syntax</title>
+ <table>
+ <tr>
+ <td>
+ <p>STRSPLITTOBAG(string, regex, limit)</p>
+ </td>
+ </tr>
+ </table>
+ </section>
+ <section>
+ <title>Terms</title>
+ <table>
+ <tr>
+ <td>
+ <p>string</p>
+ </td>
+ <td>
+ <p>The string to be split.</p>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <p>regex</p>
+ </td>
+ <td>
+ <p>The regular expression.</p>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <p>limit</p>
+ </td>
+ <td>
+ <p>If the value is positive, the pattern (the compiled representation of the regular expression)
+ is applied at most limit-1 times, therefore the value of the argument means the maximum size
+ of the result bag. The last tuple of the result bag will contain all input after the last
+ match.
+ </p>
+ <p>If the value is negative, no limit is applied to the size of the result bag.</p>
+ <p>If the value is zero, no limit is applied to the size of the result bag either, and trailing
+ empty strings (if any) will be removed.
+ </p>
+ </td>
+ </tr>
+ </table>
+ </section>
+ <section>
+ <title>Usage</title>
+ <p>
+ Use the STRSPLITTOBAG function to split a string around matches of a given regular expression.
+ </p>
+ <p>
+ For example, given the string (open:source:software), STRSPLITTOBAG (string, ':',2) will return
+ {(open),(source:software)} and STRSPLITTOBAG (string, ':',3) will return {(open),(source),(software)}.
+ </p>
+ </section>
+ </section>
+
+ <!-- ======================================================== -->
<section id="substring">
<title>SUBSTRING</title>
<p>Returns a substring from a given string. </p>
Modified: pig/trunk/src/docs/src/documentation/content/xdocs/pig-index.xml
URL: http://svn.apache.org/viewvc/pig/trunk/src/docs/src/documentation/content/xdocs/pig-index.xml?rev=1627713&r1=1627712&r2=1627713&view=diff
==============================================================================
--- pig/trunk/src/docs/src/documentation/content/xdocs/pig-index.xml (original)
+++ pig/trunk/src/docs/src/documentation/content/xdocs/pig-index.xml Fri Sep 26 04:57:34 2014
@@ -964,6 +964,8 @@
<p><a href="func.html#strsplit">STRSPLIT</a> function</p>
+<p><a href="func.html#strsplittobag">STRSPLITTOBAG</a> function</p>
+
<p><a href="func.html#substring">SUBSTRING</a> function</p>
<p><a href="func.html#sum">SUM</a> function</p>
Added: pig/trunk/src/org/apache/pig/builtin/STRSPLITTOBAG.java
URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/builtin/STRSPLITTOBAG.java?rev=1627713&view=auto
==============================================================================
--- pig/trunk/src/org/apache/pig/builtin/STRSPLITTOBAG.java (added)
+++ pig/trunk/src/org/apache/pig/builtin/STRSPLITTOBAG.java Fri Sep 26 04:57:34 2014
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.pig.builtin;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.FuncSpec;
+import org.apache.pig.PigWarning;
+import org.apache.pig.data.*;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.PatternSyntaxException;
+
+/**
+ * Wrapper around Java's String.split<br>
+ * input tuple: first column is assumed to have a string to split;<br>
+ * the optional second column is assumed to have the delimiter or regex to split on;<br>
+ * if not provided, it's assumed to be '\s' (a single whitespace character)<br>
+ * the optional third column may provide a limit to the number of results.<br>
+ * If limit is not provided, 0 is assumed, as per Java's split().
+ */
+
+public class STRSPLITTOBAG extends EvalFunc<DataBag> {
+
+ private final static BagFactory bagFactory = BagFactory.getInstance();
+ private final static TupleFactory tupleFactory = TupleFactory.getInstance();
+
+ /**
+ * Splits the input string with Java's String.split; each piece becomes a one-field tuple in the bag.
+ * @param input tuple; first column is assumed to have a string to split;
+ * the optional second column is assumed to have the delimiter or regex to split on;<br>
+ * if not provided, it's assumed to be '\s' (a single whitespace character);<br>
+ * the optional third column may provide a limit to the number of results.<br>
+ * If limit is not provided, 0 is assumed, as per Java's split().
+ * @return the bag of splits; null if input is null/empty, an argument is null or mistyped, or the regex is invalid
+ * @throws java.io.IOException
+ */
+ @Override
+ public DataBag exec(Tuple input) throws IOException {
+ if (input == null || input.size() < 1) {
+ return null;
+ }
+ try {
+ String source = (String) input.get(0);
+ String delim = (input.size() > 1) ? (String) input.get(1) : "\\s";
+ // Integer.valueOf keeps the conditional boxed: a null limit must not be auto-unboxed (NPE).
+ Integer limit = (input.size() > 2) ? (Integer) input.get(2) : Integer.valueOf(0);
+ if (source == null || delim == null || limit == null) {
+ return null;
+ }
+ String[] splits = source.split(delim, limit);
+ DataBag dataBag = bagFactory.newDefaultBag();
+ for (String eachSplit : splits) {
+ Tuple tuple = tupleFactory.newTuple(1);
+ tuple.set(0, eachSplit);
+ dataBag.add(tuple);
+ }
+ return dataBag;
+ } catch (ClassCastException e) {
+ // getStackTrace() may legally be empty (e.g. JIT-preallocated exceptions), so guard the [0] access.
+ StackTraceElement[] trace = e.getStackTrace();
+ warn("class cast exception at " + (trace.length > 0 ? trace[0] : "unknown location"), PigWarning.UDF_WARNING_1);
+ } catch (PatternSyntaxException e) {
+ warn(e.getMessage(), PigWarning.UDF_WARNING_1);
+ }
+ // this only happens if the try block did not complete normally
+ return null;
+ }
+
+ @Override
+ public Schema outputSchema(Schema input) {
+ return new Schema(new Schema.FieldSchema(null, DataType.BAG));
+ }
+
+ @Override
+ public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
+ List<FuncSpec> funcList = new ArrayList<FuncSpec>();
+ Schema s = new Schema(new Schema.FieldSchema(null, DataType.CHARARRAY)); // (string)
+ Schema s1 = new Schema(); // (string, regex)
+ s1.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+ s1.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+ Schema s2 = new Schema(); // (string, regex, limit)
+ s2.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+ s2.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+ s2.add(new Schema.FieldSchema(null, DataType.INTEGER));
+
+ funcList.add(new FuncSpec(this.getClass().getName(), s));
+ funcList.add(new FuncSpec(this.getClass().getName(), s1));
+ funcList.add(new FuncSpec(this.getClass().getName(), s2));
+ return funcList;
+ }
+
+ @Override
+ public boolean allowCompileTimeCalculation() {
+ return true;
+ }
+}
Modified: pig/trunk/test/org/apache/pig/test/TestBuiltin.java
URL: http://svn.apache.org/viewvc/pig/trunk/test/org/apache/pig/test/TestBuiltin.java?rev=1627713&r1=1627712&r2=1627713&view=diff
==============================================================================
--- pig/trunk/test/org/apache/pig/test/TestBuiltin.java (original)
+++ pig/trunk/test/org/apache/pig/test/TestBuiltin.java Fri Sep 26 04:57:34 2014
@@ -2731,14 +2731,16 @@ public class TestBuiltin {
pigServer.registerQuery("=> load '" + Util.encodeEscape(inputFile.getAbsolutePath()) + "' as (name: chararray);");
pigServer.registerQuery("B = foreach @ generate SUBSTRING(name, 0, 3), " +
"INDEXOF(name, 'a'), INDEXOF(name, 'a', 3), LAST_INDEX_OF(name, 'a'), REPLACE(name, 'a', 'b'), " +
- "STRSPLIT(name), STRSPLIT(name, ' '), STRSPLIT(name, ' ', 0), TRIM(name);");
+ "STRSPLIT(name), STRSPLIT(name, ' '), STRSPLIT(name, ' ', 0), STRSPLITTOBAG(name), STRSPLITTOBAG(name,' ')" +
+ ", STRSPLITTOBAG(name,' ',0), TRIM(name);");
Iterator<Tuple> it = pigServer.openIterator("B");
assertTrue(it.hasNext());
Tuple t = it.next();
Tuple expected = Util.buildTuple("amy", "smith");
+ DataBag expectedBag = Util.createBag(new Tuple[]{Util.buildTuple("amy"), Util.buildTuple("smith")});
assertTrue(!it.hasNext());
- assertEquals(9, t.size());
+ assertEquals(12, t.size());
assertEquals("amy", t.get(0));
assertEquals(0, t.get(1));
assertEquals(-1, t.get(2));
@@ -2747,7 +2749,10 @@ public class TestBuiltin {
assertEquals(expected, t.get(5));
assertEquals(expected, t.get(6));
assertEquals(expected, t.get(7));
- assertEquals("amy smith", t.get(8));
+ assertEquals(expectedBag, t.get(8));
+ assertEquals(expectedBag, t.get(9));
+ assertEquals(expectedBag, t.get(10));
+ assertEquals("amy smith", t.get(11));
// test untyped data
pigServer.registerQuery("=> load '" + Util.encodeEscape(inputFile.getAbsolutePath()) + "' as (name);");
Modified: pig/trunk/test/org/apache/pig/test/TestStringUDFs.java
URL: http://svn.apache.org/viewvc/pig/trunk/test/org/apache/pig/test/TestStringUDFs.java?rev=1627713&r1=1627712&r2=1627713&view=diff
==============================================================================
--- pig/trunk/test/org/apache/pig/test/TestStringUDFs.java (original)
+++ pig/trunk/test/org/apache/pig/test/TestStringUDFs.java Fri Sep 26 04:57:34 2014
@@ -32,11 +32,13 @@ import org.apache.pig.builtin.REPLACE;
import org.apache.pig.builtin.STARTSWITH;
import org.apache.pig.builtin.ENDSWITH;
import org.apache.pig.builtin.STRSPLIT;
+import org.apache.pig.builtin.STRSPLITTOBAG;
import org.apache.pig.builtin.SUBSTRING;
import org.apache.pig.builtin.TRIM;
import org.apache.pig.builtin.LTRIM;
import org.apache.pig.builtin.RTRIM;
import org.apache.pig.builtin.EqualsIgnoreCase;
+import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.junit.Test;
@@ -227,6 +229,77 @@ public class TestStringUDFs {
}
@Test
+ public void testSplitToBag() throws IOException {
+ STRSPLITTOBAG bagSplit = new STRSPLITTOBAG();
+ // Cases call exec() directly on hand-built tuples; bag contents are compared via each tuple's toString().
+ // delimiter "4" never occurs in the input, so the whole string comes back as a single tuple
+ Tuple testTuple = Util.buildTuple("1 2 3", "4");
+ DataBag outputBag = bagSplit.exec(testTuple);
+ assertEquals("No of records split should be 1", 1, outputBag.size());
+ assertEquals("Split string should match the input string", "(1 2 3)", outputBag.iterator().next().toString());
+
+ // no delimiter argument: the default regex "\\s" splits on each whitespace character
+ testTuple = Util.buildTuple("1 2 3");
+ outputBag = bagSplit.exec(testTuple);
+ String[] assertionArray = {"1", "2", "3"};
+ assertEquals("No of record split should be " + assertionArray.length, assertionArray.length, outputBag.size());
+
+ int i = 0;
+ for (Tuple t : outputBag) {
+ assertEquals("Assertion tests on split strings", "(" + assertionArray[i] + ")", t.toString());
+ i++;
+ }
+
+ // explicit delimiter ":" with no limit argument
+ testTuple = Util.buildTuple("1:2:3", ":");
+ outputBag = bagSplit.exec(testTuple);
+ assertEquals("No of record split should be " + assertionArray.length, assertionArray.length, outputBag.size());
+ i = 0;
+ for (Tuple t : outputBag) {
+ assertEquals("Assertion tests on split strings", "(" + assertionArray[i] + ")", t.toString());
+ i++;
+ }
+
+ // a positive limit caps the bag size; the last tuple holds the unsplit remainder
+ testTuple = Util.buildTuple("1:2:3", ":", 2);
+ outputBag = bagSplit.exec(testTuple);
+ assertionArray = new String[]{"1", "2:3"};
+ assertEquals("No of record split should be " + assertionArray.length, assertionArray.length, outputBag.size());
+ i = 0;
+ for (Tuple t : outputBag) {
+ assertEquals("Matched records in split results with limit", "(" + assertionArray[i] + ")", t.toString());
+ i++;
+ }
+
+ // default limit 0 drops the trailing empty string produced by the final space (no trimming is involved)
+ testTuple = Util.buildTuple("1 2 ");
+ outputBag = bagSplit.exec(testTuple);
+ assertionArray = new String[]{"1", "2"};
+ assertEquals("No of record split should be " + assertionArray.length, assertionArray.length, outputBag.size());
+ i = 0;
+ for (Tuple t : outputBag) {
+ assertEquals("Matched records in split results with trimming of whitespaces", "(" + assertionArray[i] + ")", t.toString());
+ i++;
+ }
+
+ // a positive limit keeps trailing empty strings (the expected tuples are empty strings, not nulls)
+ testTuple = Util.buildTuple("1:2:::", ":", 10);
+ outputBag = bagSplit.exec(testTuple);
+ assertionArray = new String[]{"1", "2", "", "", ""};
+ assertEquals("No of record split should be " + assertionArray.length, assertionArray.length, outputBag.size());
+ i = 0;
+ for (Tuple t : outputBag) {
+ assertEquals("Matched records in split results with forcing null matched with limit", "(" + assertionArray[i] + ")", t.toString());
+ i++;
+ }
+
+ // wrongly typed arguments: exec catches the ClassCastException and is expected to return null
+ testTuple = Util.buildTuple(1, 2, 3);
+ outputBag = bagSplit.exec(testTuple);
+ assertEquals("Wrong Schema checks", null, outputBag);
+ }
+
+ @Test
public void testStartsWith() throws IOException {
STARTSWITH startsWith = new STARTSWITH();
Tuple testTuple1 = Util.buildTuple("foo", "bar");