You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@pig.apache.org by "hc busy (JIRA)" <ji...@apache.org> on 2010/04/21 07:10:50 UTC

[jira] Created: (PIG-1385) UDF to create tuples and bags

UDF to create tuples and bags
-----------------------------

                 Key: PIG-1385
                 URL: https://issues.apache.org/jira/browse/PIG-1385
             Project: Pig
          Issue Type: New Feature
          Components: tools
            Reporter: hc busy


Based on this conversation:

totally, go for it, it'd be pretty straightforward to add this
functionality.
- Hide quoted text -



On Tue, Apr 20, 2010 at 6:45 PM, hc busy <hc...@gmail.com> wrote:

> Hey, while we're on the subject, and I have your attention, can we
> refactor the UDF MaxTupleByFirstField to take constructor arguments?
>
> *define customMaxTuple ExtremalTupleByNthField(n, 'min');*
> *G = group T by id;*
> *M = foreach T generate customMaxTuple(T);*
>
>
> Where n is the nth field, and the second parameter allows us to specify
> "min", "max", "median",  etc...
>
> Does this seem like something useful to everyone?
>
>
>
> On Tue, Apr 20, 2010 at 6:34 PM, hc busy <hc...@gmail.com> wrote:
>
> > What about making them part of the language using symbols?
> >
> > instead of
> >
> > foreach T generate Tuple($0, $1, $2), Bag($3, $4, $5), $6, $7;
> >
> > have language support
> >
> > foreach T generate ($0, $1, $2), {$3, $4, $5}, $6, $7;
> >
> > or even:
> >
> > foreach T generate ($0, $1, $2), {$3, $4, $5}, [$6#$7, $8#$9], $10, $11;
> >
> >
> > Is there a reason not to do the second or third, other than their being
> > more complicated?
> >
> > Certainly I'd volunteer to put the top implementation into the util
> > package and submit them for builtins, but the latter syntactic sugar
> > seems more natural.
> >
> >
> >
> > On Tue, Apr 20, 2010 at 5:24 PM, Alan Gates <ga...@yahoo-inc.com> wrote:
> >
> >> The grouping package in piggybank is left over from back when Pig
> >> allowed users to define grouping functions (0.1).  Functions like these
> >> should go in evaluation.util.
> >>
> >> However, I'd consider putting these in builtin (in main Pig) instead.
> >> These are things everyone asks for and they seem like a reasonable
> >> addition to the core engine.  This will be more of a burden to write (as
> >> we'll hold them to a higher standard) but of more use to people as well.
> >>
> >> Alan.
> >>
> >>
> >> On Apr 19, 2010, at 12:53 PM, hc busy wrote:
> >>
> >>  Some times I wonder... I mean, somebody went to the trouble of making a
> >>> path
> >>> called
> >>>
> >>> org.apache.pig.piggybank.grouping
> >>>
> >>> (where it seems like this code belongs), but didn't check in any Java
> >>> code into that package.
> >>>
> >>>
> >>> Any comments about where to put these kinds of utility classes?
> >>>
> >>>
> >>>
> >>> On Mon, Apr 19, 2010 at 12:07 PM, Andrey S <oc...@gmail.com> wrote:
> >>>
> >>>  2010/4/19 hc busy <hc...@gmail.com>
> >>>>
> >>>>  That's just the way it is right now, you can't make bags or tuples
> >>>>> directly... Maybe we should have some UDF's in piggybank for these:
> >>>>>
> >>>>> toBag()
> >>>>> toTuple(); --which is kinda like exec(Tuple in){return in;}
> >>>>> TupleToBag(); --some times you need it this way for some reason.
> >>>>>
> >>>>>
> >>>>>  OK. I'll post my current code here; maybe later I'll make a patch
> >>>> (if such an implementation is acceptable, of course).
> >>>>
> >>>> import org.apache.pig.EvalFunc;
> >>>> import org.apache.pig.data.BagFactory;
> >>>> import org.apache.pig.data.DataBag;
> >>>> import org.apache.pig.data.Tuple;
> >>>> import org.apache.pig.data.TupleFactory;
> >>>>
> >>>> import java.io.IOException;
> >>>>
> >>>> /**
> >>>> * Convert any sequence of fields to bag with specified count of
> >>>> fields<br>
> >>>> * Schema: count:int, fld1 [, fld2, fld3, fld4... ].
> >>>> * Output: count=2, then { (fld1, fld2) , (fld3, fld4) ... }
> >>>> *
> >>>> * @author astepachev
> >>>> */
> >>>> public class ToBag extends EvalFunc<DataBag> {
> >>>>  public BagFactory bagFactory;
> >>>>  public TupleFactory tupleFactory;
> >>>>
> >>>>  public ToBag() {
> >>>>      bagFactory = BagFactory.getInstance();
> >>>>      tupleFactory = TupleFactory.getInstance();
> >>>>  }
> >>>>
> >>>>  @Override
> >>>>  public DataBag exec(Tuple input) throws IOException {
> >>>>      if (input.isNull())
> >>>>          return null;
> >>>>      final DataBag bag = bagFactory.newDefaultBag();
> >>>>      final Integer couter = (Integer) input.get(0);
> >>>>      if (couter == null)
> >>>>          return null;
> >>>>      Tuple tuple = tupleFactory.newTuple();
> >>>>      for (int i = 0; i < input.size() - 1; i++) {
> >>>>          if (i % couter == 0) {
> >>>>              tuple = tupleFactory.newTuple();
> >>>>              bag.add(tuple);
> >>>>          }
> >>>>          tuple.append(input.get(i + 1));
> >>>>      }
> >>>>      return bag;
> >>>>  }
> >>>> }
> >>>>
> >>>> import org.apache.pig.ExecType;
> >>>> import org.apache.pig.PigServer;
> >>>> import org.junit.Before;
> >>>> import org.junit.Test;
> >>>>
> >>>> import java.io.IOException;
> >>>> import java.net.URISyntaxException;
> >>>> import java.net.URL;
> >>>>
> >>>> import static org.junit.Assert.assertTrue;
> >>>>
> >>>> /**
> >>>> * @author astepachev
> >>>> */
> >>>> public class ToBagTest {
> >>>>  PigServer pigServer;
> >>>>  URL inputTxt;
> >>>>
> >>>>  @Before
> >>>>  public void init() throws IOException, URISyntaxException {
> >>>>      pigServer = new PigServer(ExecType.LOCAL);
> >>>>      inputTxt =
> >>>> this.getClass().getResource("bagTest.txt").toURI().toURL();
> >>>>  }
> >>>>
> >>>>  @Test
> >>>>  public void testSimple() throws IOException {
> >>>>      pigServer.registerQuery("a = load '" + inputTxt.toExternalForm()
> +
> >>>> "' using PigStorage(',') " +
> >>>>              "as (id:int, a:chararray, b:chararray, c:chararray,
> >>>> d:chararray);");
> >>>>      pigServer.registerQuery("last = foreach a generate flatten(" +
> >>>> ToBag.class.getName() + "(2, id, a, id, b, id, c));");
> >>>>
> >>>>      pigServer.deleteFile("target/pigtest/func1.txt");
> >>>>      pigServer.store("last", "target/pigtest/func1.txt");
> >>>>      assertTrue(pigServer.fileSize("target/pigtest/func1.txt") > 0);
> >>>>  }
> >>>> }
> >>>>
> >>>>
> >>
> >
>


-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.


[jira] Commented: (PIG-1385) UDF to create tuples and bags

Posted by "Alan Gates (JIRA)" <ji...@apache.org>.
    [ https://issues.apache.org/jira/browse/PIG-1385?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12860359#action_12860359 ] 

Alan Gates commented on PIG-1385:
---------------------------------

I made a few cosmetic changes.  I added the license header to each of the three files, changed all the tabs to spaces, and moved the tests to the evaluation.util package under test.  Code looks good, tests pass.  If you're ok with my changes I'll check it in.

> UDF to create tuples and bags
> -----------------------------
>
>                 Key: PIG-1385
>                 URL: https://issues.apache.org/jira/browse/PIG-1385
>             Project: Pig
>          Issue Type: New Feature
>          Components: tools
>    Affects Versions: 0.6.0
>            Reporter: hc busy
>            Assignee: hc busy
>         Attachments: PIG-1385-trunk.patch
>
>   Original Estimate: 24h
>  Remaining Estimate: 24h
>
> Based on this conversation:
> > On Tue, Apr 20, 2010 at 6:34 PM, hc busy <hc...@gmail.com> wrote:
> >
> > > What about making them part of the language using symbols?
> > >
> > > instead of
> > >
> > > foreach T generate Tuple($0, $1, $2), Bag($3, $4, $5), $6, $7;
> > >
> > > have language support
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, $6, $7;
> > >
> > > or even:
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, [$6#$7, $8#$9], $10, $11;
> > >
> > >
> > > Is there reason not to do the second or third other than being more
> > > complicated?
> > >
> > > Certainly I'd volunteer to put the top implementation in to the util
> > > package and submit them for builtin's, but the latter syntactic candies
> > > seems more natural..
> > >
> > >
> > >
> > > On Tue, Apr 20, 2010 at 5:24 PM, Alan Gates <ga...@yahoo-inc.com> wrote:
> > >
> > >> The grouping package in piggybank is left over from back when Pig
> > allowed
> > >> users to define grouping functions (0.1).  Functions like these should
> > go in
> > >> evaluation.util.
> > >>
> > >> However, I'd consider putting these in builtin (in main Pig) instead.
> > >>  These are things everyone asks for and they seem like a reasonable
> > addition
> > >> to the core engine.  This will be more of a burden to write (as we'll
> > hold
> > >> them to a higher standard) but of more use to people as well.
> > >>
> > >> Alan.
> > >>
> > >>
> > >> On Apr 19, 2010, at 12:53 PM, hc busy wrote:
> > >>
> > >>  Some times I wonder... I mean, somebody went to the trouble of making a
> > >>> path
> > >>> called
> > >>>
> > >>> org.apache.pig.piggybank.grouping
> > >>>
> > >>> (where it seems like this code belong), but didn't check in any java
> > code
> > >>> into that package.
> > >>>
> > >>>
> > >>> Any comment about where to put this kind of utility classes?
> > >>>
> > >>>
> > >>>
> > >>> On Mon, Apr 19, 2010 at 12:07 PM, Andrey S <oc...@gmail.com> wrote:
> > >>>
> > >>>  2010/4/19 hc busy <hc...@gmail.com>
> > >>>>
> > >>>>  That's just the way it is right now, you can't make bags or tuples
> > >>>>> directly... Maybe we should have some UDF's in piggybank for these:
> > >>>>>
> > >>>>> toBag()
> > >>>>> toTuple(); --which is kinda like exec(Tuple in){return in;}
> > >>>>> TupleToBag(); --some times you need it this way for some reason.
> > >>>>>
> > >>>>>
> > >>>>>  Ok. I place my current code here, may be later I make a patch (if
> > such
> > >>>> implementation is acceptable of course).
> > >>>>
> > >>>> import org.apache.pig.EvalFunc;
> > >>>> import org.apache.pig.data.BagFactory;
> > >>>> import org.apache.pig.data.DataBag;
> > >>>> import org.apache.pig.data.Tuple;
> > >>>> import org.apache.pig.data.TupleFactory;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>>
> > >>>> /**
> > >>>> * Convert any sequence of fields to bag with specified count of
> > >>>> fields<br>
> > >>>> * Schema: count:int, fld1 [, fld2, fld3, fld4... ].
> > >>>> * Output: count=2, then { (fld1, fld2) , (fld3, fld4) ... }
> > >>>> *
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBag extends EvalFunc<DataBag> {
> > >>>>  public BagFactory bagFactory;
> > >>>>  public TupleFactory tupleFactory;
> > >>>>
> > >>>>  public ToBag() {
> > >>>>      bagFactory = BagFactory.getInstance();
> > >>>>      tupleFactory = TupleFactory.getInstance();
> > >>>>  }
> > >>>>
> > >>>>  @Override
> > >>>>  public DataBag exec(Tuple input) throws IOException {
> > >>>>      if (input.isNull())
> > >>>>          return null;
> > >>>>      final DataBag bag = bagFactory.newDefaultBag();
> > >>>>      final Integer couter = (Integer) input.get(0);
> > >>>>      if (couter == null)
> > >>>>          return null;
> > >>>>      Tuple tuple = tupleFactory.newTuple();
> > >>>>      for (int i = 0; i < input.size() - 1; i++) {
> > >>>>          if (i % couter == 0) {
> > >>>>              tuple = tupleFactory.newTuple();
> > >>>>              bag.add(tuple);
> > >>>>          }
> > >>>>          tuple.append(input.get(i + 1));
> > >>>>      }
> > >>>>      return bag;
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>> import org.apache.pig.ExecType;
> > >>>> import org.apache.pig.PigServer;
> > >>>> import org.junit.Before;
> > >>>> import org.junit.Test;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>> import java.net.URISyntaxException;
> > >>>> import java.net.URL;
> > >>>>
> > >>>> import static org.junit.Assert.assertTrue;
> > >>>>
> > >>>> /**
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBagTest {
> > >>>>  PigServer pigServer;
> > >>>>  URL inputTxt;
> > >>>>
> > >>>>  @Before
> > >>>>  public void init() throws IOException, URISyntaxException {
> > >>>>      pigServer = new PigServer(ExecType.LOCAL);
> > >>>>      inputTxt =
> > >>>> this.getClass().getResource("bagTest.txt").toURI().toURL();
> > >>>>  }
> > >>>>
> > >>>>  @Test
> > >>>>  public void testSimple() throws IOException {
> > >>>>      pigServer.registerQuery("a = load '" + inputTxt.toExternalForm()
> > +
> > >>>> "' using PigStorage(',') " +
> > >>>>              "as (id:int, a:chararray, b:chararray, c:chararray,
> > >>>> d:chararray);");
> > >>>>      pigServer.registerQuery("last = foreach a generate flatten(" +
> > >>>> ToBag.class.getName() + "(2, id, a, id, b, id, c));");
> > >>>>
> > >>>>      pigServer.deleteFile("target/pigtest/func1.txt");
> > >>>>      pigServer.store("last", "target/pigtest/func1.txt");
> > >>>>      assertTrue(pigServer.fileSize("target/pigtest/func1.txt") > 0);
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>>
> > >>
> > >
> >

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.


[jira] Commented: (PIG-1385) UDF to create tuples and bags

Posted by "Alan Gates (JIRA)" <ji...@apache.org>.
    [ https://issues.apache.org/jira/browse/PIG-1385?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12860315#action_12860315 ] 

Alan Gates commented on PIG-1385:
---------------------------------

I'll review this.

> UDF to create tuples and bags
> -----------------------------
>
>                 Key: PIG-1385
>                 URL: https://issues.apache.org/jira/browse/PIG-1385
>             Project: Pig
>          Issue Type: New Feature
>          Components: tools
>    Affects Versions: 0.6.0
>            Reporter: hc busy
>            Assignee: hc busy
>         Attachments: PIG-1385-trunk.patch
>
>   Original Estimate: 24h
>  Remaining Estimate: 24h
>
> Based on this conversation:
> > On Tue, Apr 20, 2010 at 6:34 PM, hc busy <hc...@gmail.com> wrote:
> >
> > > What about making them part of the language using symbols?
> > >
> > > instead of
> > >
> > > foreach T generate Tuple($0, $1, $2), Bag($3, $4, $5), $6, $7;
> > >
> > > have language support
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, $6, $7;
> > >
> > > or even:
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, [$6#$7, $8#$9], $10, $11;
> > >
> > >
> > > Is there reason not to do the second or third other than being more
> > > complicated?
> > >
> > > Certainly I'd volunteer to put the top implementation in to the util
> > > package and submit them for builtin's, but the latter syntactic candies
> > > seems more natural..
> > >
> > >
> > >
> > > On Tue, Apr 20, 2010 at 5:24 PM, Alan Gates <ga...@yahoo-inc.com> wrote:
> > >
> > >> The grouping package in piggybank is left over from back when Pig
> > allowed
> > >> users to define grouping functions (0.1).  Functions like these should
> > go in
> > >> evaluation.util.
> > >>
> > >> However, I'd consider putting these in builtin (in main Pig) instead.
> > >>  These are things everyone asks for and they seem like a reasonable
> > addition
> > >> to the core engine.  This will be more of a burden to write (as we'll
> > hold
> > >> them to a higher standard) but of more use to people as well.
> > >>
> > >> Alan.
> > >>
> > >>
> > >> On Apr 19, 2010, at 12:53 PM, hc busy wrote:
> > >>
> > >>  Some times I wonder... I mean, somebody went to the trouble of making a
> > >>> path
> > >>> called
> > >>>
> > >>> org.apache.pig.piggybank.grouping
> > >>>
> > >>> (where it seems like this code belong), but didn't check in any java
> > code
> > >>> into that package.
> > >>>
> > >>>
> > >>> Any comment about where to put this kind of utility classes?
> > >>>
> > >>>
> > >>>
> > >>> On Mon, Apr 19, 2010 at 12:07 PM, Andrey S <oc...@gmail.com> wrote:
> > >>>
> > >>>  2010/4/19 hc busy <hc...@gmail.com>
> > >>>>
> > >>>>  That's just the way it is right now, you can't make bags or tuples
> > >>>>> directly... Maybe we should have some UDF's in piggybank for these:
> > >>>>>
> > >>>>> toBag()
> > >>>>> toTuple(); --which is kinda like exec(Tuple in){return in;}
> > >>>>> TupleToBag(); --some times you need it this way for some reason.
> > >>>>>
> > >>>>>
> > >>>>>  Ok. I place my current code here, may be later I make a patch (if
> > such
> > >>>> implementation is acceptable of course).
> > >>>>
> > >>>> import org.apache.pig.EvalFunc;
> > >>>> import org.apache.pig.data.BagFactory;
> > >>>> import org.apache.pig.data.DataBag;
> > >>>> import org.apache.pig.data.Tuple;
> > >>>> import org.apache.pig.data.TupleFactory;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>>
> > >>>> /**
> > >>>> * Convert any sequence of fields to bag with specified count of
> > >>>> fields<br>
> > >>>> * Schema: count:int, fld1 [, fld2, fld3, fld4... ].
> > >>>> * Output: count=2, then { (fld1, fld2) , (fld3, fld4) ... }
> > >>>> *
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBag extends EvalFunc<DataBag> {
> > >>>>  public BagFactory bagFactory;
> > >>>>  public TupleFactory tupleFactory;
> > >>>>
> > >>>>  public ToBag() {
> > >>>>      bagFactory = BagFactory.getInstance();
> > >>>>      tupleFactory = TupleFactory.getInstance();
> > >>>>  }
> > >>>>
> > >>>>  @Override
> > >>>>  public DataBag exec(Tuple input) throws IOException {
> > >>>>      if (input.isNull())
> > >>>>          return null;
> > >>>>      final DataBag bag = bagFactory.newDefaultBag();
> > >>>>      final Integer couter = (Integer) input.get(0);
> > >>>>      if (couter == null)
> > >>>>          return null;
> > >>>>      Tuple tuple = tupleFactory.newTuple();
> > >>>>      for (int i = 0; i < input.size() - 1; i++) {
> > >>>>          if (i % couter == 0) {
> > >>>>              tuple = tupleFactory.newTuple();
> > >>>>              bag.add(tuple);
> > >>>>          }
> > >>>>          tuple.append(input.get(i + 1));
> > >>>>      }
> > >>>>      return bag;
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>> import org.apache.pig.ExecType;
> > >>>> import org.apache.pig.PigServer;
> > >>>> import org.junit.Before;
> > >>>> import org.junit.Test;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>> import java.net.URISyntaxException;
> > >>>> import java.net.URL;
> > >>>>
> > >>>> import static org.junit.Assert.assertTrue;
> > >>>>
> > >>>> /**
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBagTest {
> > >>>>  PigServer pigServer;
> > >>>>  URL inputTxt;
> > >>>>
> > >>>>  @Before
> > >>>>  public void init() throws IOException, URISyntaxException {
> > >>>>      pigServer = new PigServer(ExecType.LOCAL);
> > >>>>      inputTxt =
> > >>>> this.getClass().getResource("bagTest.txt").toURI().toURL();
> > >>>>  }
> > >>>>
> > >>>>  @Test
> > >>>>  public void testSimple() throws IOException {
> > >>>>      pigServer.registerQuery("a = load '" + inputTxt.toExternalForm()
> > +
> > >>>> "' using PigStorage(',') " +
> > >>>>              "as (id:int, a:chararray, b:chararray, c:chararray,
> > >>>> d:chararray);");
> > >>>>      pigServer.registerQuery("last = foreach a generate flatten(" +
> > >>>> ToBag.class.getName() + "(2, id, a, id, b, id, c));");
> > >>>>
> > >>>>      pigServer.deleteFile("target/pigtest/func1.txt");
> > >>>>      pigServer.store("last", "target/pigtest/func1.txt");
> > >>>>      assertTrue(pigServer.fileSize("target/pigtest/func1.txt") > 0);
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>>
> > >>
> > >
> >

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.


[jira] Commented: (PIG-1385) UDF to create tuples and bags

Posted by "Hadoop QA (JIRA)" <ji...@apache.org>.
    [ https://issues.apache.org/jira/browse/PIG-1385?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12860134#action_12860134 ] 

Hadoop QA commented on PIG-1385:
--------------------------------

-1 overall.  Here are the results of testing the latest attachment 
  http://issues.apache.org/jira/secure/attachment/12442630/PIG-1385-trunk.patch
  against trunk revision 937095.

    +1 @author.  The patch does not contain any @author tags.

    +1 tests included.  The patch appears to include 3 new or modified tests.

    +1 javadoc.  The javadoc tool did not generate any warning messages.

    +1 javac.  The applied patch does not increase the total number of javac compiler warnings.

    +1 findbugs.  The patch does not introduce any new Findbugs warnings.

    -1 release audit.  The applied patch generated 538 release audit warnings (more than the trunk's current 535 warnings).

    -1 core tests.  The patch failed core unit tests.

    +1 contrib tests.  The patch passed contrib unit tests.

Test results: http://hudson.zones.apache.org/hudson/job/Pig-Patch-h7.grid.sp2.yahoo.net/300/testReport/
Release audit warnings: http://hudson.zones.apache.org/hudson/job/Pig-Patch-h7.grid.sp2.yahoo.net/300/artifact/trunk/patchprocess/releaseAuditDiffWarnings.txt
Findbugs warnings: http://hudson.zones.apache.org/hudson/job/Pig-Patch-h7.grid.sp2.yahoo.net/300/artifact/trunk/build/test/findbugs/newPatchFindbugsWarnings.html
Console output: http://hudson.zones.apache.org/hudson/job/Pig-Patch-h7.grid.sp2.yahoo.net/300/console

This message is automatically generated.

> UDF to create tuples and bags
> -----------------------------
>
>                 Key: PIG-1385
>                 URL: https://issues.apache.org/jira/browse/PIG-1385
>             Project: Pig
>          Issue Type: New Feature
>          Components: tools
>    Affects Versions: 0.6.0
>            Reporter: hc busy
>            Assignee: hc busy
>         Attachments: PIG-1385-trunk.patch
>
>   Original Estimate: 24h
>  Remaining Estimate: 24h
>
> Based on this conversation:
> > On Tue, Apr 20, 2010 at 6:34 PM, hc busy <hc...@gmail.com> wrote:
> >
> > > What about making them part of the language using symbols?
> > >
> > > instead of
> > >
> > > foreach T generate Tuple($0, $1, $2), Bag($3, $4, $5), $6, $7;
> > >
> > > have language support
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, $6, $7;
> > >
> > > or even:
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, [$6#$7, $8#$9], $10, $11;
> > >
> > >
> > > Is there reason not to do the second or third other than being more
> > > complicated?
> > >
> > > Certainly I'd volunteer to put the top implementation in to the util
> > > package and submit them for builtin's, but the latter syntactic candies
> > > seems more natural..
> > >
> > >
> > >
> > > On Tue, Apr 20, 2010 at 5:24 PM, Alan Gates <ga...@yahoo-inc.com> wrote:
> > >
> > >> The grouping package in piggybank is left over from back when Pig
> > allowed
> > >> users to define grouping functions (0.1).  Functions like these should
> > go in
> > >> evaluation.util.
> > >>
> > >> However, I'd consider putting these in builtin (in main Pig) instead.
> > >>  These are things everyone asks for and they seem like a reasonable
> > addition
> > >> to the core engine.  This will be more of a burden to write (as we'll
> > hold
> > >> them to a higher standard) but of more use to people as well.
> > >>
> > >> Alan.
> > >>
> > >>
> > >> On Apr 19, 2010, at 12:53 PM, hc busy wrote:
> > >>
> > >>  Some times I wonder... I mean, somebody went to the trouble of making a
> > >>> path
> > >>> called
> > >>>
> > >>> org.apache.pig.piggybank.grouping
> > >>>
> > >>> (where it seems like this code belong), but didn't check in any java
> > code
> > >>> into that package.
> > >>>
> > >>>
> > >>> Any comment about where to put this kind of utility classes?
> > >>>
> > >>>
> > >>>
> > >>> On Mon, Apr 19, 2010 at 12:07 PM, Andrey S <oc...@gmail.com> wrote:
> > >>>
> > >>>  2010/4/19 hc busy <hc...@gmail.com>
> > >>>>
> > >>>>  That's just the way it is right now, you can't make bags or tuples
> > >>>>> directly... Maybe we should have some UDF's in piggybank for these:
> > >>>>>
> > >>>>> toBag()
> > >>>>> toTuple(); --which is kinda like exec(Tuple in){return in;}
> > >>>>> TupleToBag(); --some times you need it this way for some reason.
> > >>>>>
> > >>>>>
> > >>>>>  Ok. I place my current code here, may be later I make a patch (if
> > such
> > >>>> implementation is acceptable of course).
> > >>>>
> > >>>> import org.apache.pig.EvalFunc;
> > >>>> import org.apache.pig.data.BagFactory;
> > >>>> import org.apache.pig.data.DataBag;
> > >>>> import org.apache.pig.data.Tuple;
> > >>>> import org.apache.pig.data.TupleFactory;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>>
> > >>>> /**
> > >>>> * Convert any sequence of fields to bag with specified count of
> > >>>> fields<br>
> > >>>> * Schema: count:int, fld1 [, fld2, fld3, fld4... ].
> > >>>> * Output: count=2, then { (fld1, fld2) , (fld3, fld4) ... }
> > >>>> *
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBag extends EvalFunc<DataBag> {
> > >>>>  public BagFactory bagFactory;
> > >>>>  public TupleFactory tupleFactory;
> > >>>>
> > >>>>  public ToBag() {
> > >>>>      bagFactory = BagFactory.getInstance();
> > >>>>      tupleFactory = TupleFactory.getInstance();
> > >>>>  }
> > >>>>
> > >>>>  @Override
> > >>>>  public DataBag exec(Tuple input) throws IOException {
> > >>>>      if (input.isNull())
> > >>>>          return null;
> > >>>>      final DataBag bag = bagFactory.newDefaultBag();
> > >>>>      final Integer couter = (Integer) input.get(0);
> > >>>>      if (couter == null)
> > >>>>          return null;
> > >>>>      Tuple tuple = tupleFactory.newTuple();
> > >>>>      for (int i = 0; i < input.size() - 1; i++) {
> > >>>>          if (i % couter == 0) {
> > >>>>              tuple = tupleFactory.newTuple();
> > >>>>              bag.add(tuple);
> > >>>>          }
> > >>>>          tuple.append(input.get(i + 1));
> > >>>>      }
> > >>>>      return bag;
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>> import org.apache.pig.ExecType;
> > >>>> import org.apache.pig.PigServer;
> > >>>> import org.junit.Before;
> > >>>> import org.junit.Test;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>> import java.net.URISyntaxException;
> > >>>> import java.net.URL;
> > >>>>
> > >>>> import static org.junit.Assert.assertTrue;
> > >>>>
> > >>>> /**
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBagTest {
> > >>>>  PigServer pigServer;
> > >>>>  URL inputTxt;
> > >>>>
> > >>>>  @Before
> > >>>>  public void init() throws IOException, URISyntaxException {
> > >>>>      pigServer = new PigServer(ExecType.LOCAL);
> > >>>>      inputTxt =
> > >>>> this.getClass().getResource("bagTest.txt").toURI().toURL();
> > >>>>  }
> > >>>>
> > >>>>  @Test
> > >>>>  public void testSimple() throws IOException {
> > >>>>      pigServer.registerQuery("a = load '" + inputTxt.toExternalForm()
> > +
> > >>>> "' using PigStorage(',') " +
> > >>>>              "as (id:int, a:chararray, b:chararray, c:chararray,
> > >>>> d:chararray);");
> > >>>>      pigServer.registerQuery("last = foreach a generate flatten(" +
> > >>>> ToBag.class.getName() + "(2, id, a, id, b, id, c));");
> > >>>>
> > >>>>      pigServer.deleteFile("target/pigtest/func1.txt");
> > >>>>      pigServer.store("last", "target/pigtest/func1.txt");
> > >>>>      assertTrue(pigServer.fileSize("target/pigtest/func1.txt") > 0);
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>>
> > >>
> > >
> >

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.


[jira] Updated: (PIG-1385) UDF to create tuples and bags

Posted by "hc busy (JIRA)" <ji...@apache.org>.
     [ https://issues.apache.org/jira/browse/PIG-1385?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

hc busy updated PIG-1385:
-------------------------

    Status: Patch Available  (was: Open)

> UDF to create tuples and bags
> -----------------------------
>
>                 Key: PIG-1385
>                 URL: https://issues.apache.org/jira/browse/PIG-1385
>             Project: Pig
>          Issue Type: New Feature
>          Components: tools
>    Affects Versions: 0.6.0
>            Reporter: hc busy
>         Attachments: PIG-1385-trunk.patch
>
>   Original Estimate: 24h
>  Remaining Estimate: 24h
>
> Based on this conversation:
> > On Tue, Apr 20, 2010 at 6:34 PM, hc busy <hc...@gmail.com> wrote:
> >
> > > What about making them part of the language using symbols?
> > >
> > > instead of
> > >
> > > foreach T generate Tuple($0, $1, $2), Bag($3, $4, $5), $6, $7;
> > >
> > > have language support
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, $6, $7;
> > >
> > > or even:
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, [$6#$7, $8#$9], $10, $11;
> > >
> > >
> > > Is there reason not to do the second or third other than being more
> > > complicated?
> > >
> > > Certainly I'd volunteer to put the top implementation in to the util
> > > package and submit them for builtin's, but the latter syntactic candies
> > > seems more natural..
> > >
> > >
> > >
> > > On Tue, Apr 20, 2010 at 5:24 PM, Alan Gates <ga...@yahoo-inc.com> wrote:
> > >
> > >> The grouping package in piggybank is left over from back when Pig
> > allowed
> > >> users to define grouping functions (0.1).  Functions like these should
> > go in
> > >> evaluation.util.
> > >>
> > >> However, I'd consider putting these in builtin (in main Pig) instead.
> > >>  These are things everyone asks for and they seem like a reasonable
> > addition
> > >> to the core engine.  This will be more of a burden to write (as we'll
> > hold
> > >> them to a higher standard) but of more use to people as well.
> > >>
> > >> Alan.
> > >>
> > >>
> > >> On Apr 19, 2010, at 12:53 PM, hc busy wrote:
> > >>
> > >>  Some times I wonder... I mean, somebody went to the trouble of making a
> > >>> path
> > >>> called
> > >>>
> > >>> org.apache.pig.piggybank.grouping
> > >>>
> > >>> (where it seems like this code belong), but didn't check in any java
> > code
> > >>> into that package.
> > >>>
> > >>>
> > >>> Any comment about where to put this kind of utility classes?
> > >>>
> > >>>
> > >>>
> > >>> On Mon, Apr 19, 2010 at 12:07 PM, Andrey S <oc...@gmail.com> wrote:
> > >>>
> > >>>  2010/4/19 hc busy <hc...@gmail.com>
> > >>>>
> > >>>>  That's just the way it is right now, you can't make bags or tuples
> > >>>>> directly... Maybe we should have some UDF's in piggybank for these:
> > >>>>>
> > >>>>> toBag()
> > >>>>> toTuple(); --which is kinda like exec(Tuple in){return in;}
> > >>>>> TupleToBag(); --some times you need it this way for some reason.
> > >>>>>
> > >>>>>
> > >>>>>  Ok. I place my current code here, may be later I make a patch (if
> > such
> > >>>> implementation is acceptable of course).
> > >>>>
> > >>>> import org.apache.pig.EvalFunc;
> > >>>> import org.apache.pig.data.BagFactory;
> > >>>> import org.apache.pig.data.DataBag;
> > >>>> import org.apache.pig.data.Tuple;
> > >>>> import org.apache.pig.data.TupleFactory;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>>
> > >>>> /**
> > >>>> * Convert any sequence of fields to bag with specified count of
> > >>>> fields<br>
> > >>>> * Schema: count:int, fld1 [, fld2, fld3, fld4... ].
> > >>>> * Output: count=2, then { (fld1, fld2) , (fld3, fld4) ... }
> > >>>> *
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBag extends EvalFunc<DataBag> {
> > >>>>  public BagFactory bagFactory;
> > >>>>  public TupleFactory tupleFactory;
> > >>>>
> > >>>>  public ToBag() {
> > >>>>      bagFactory = BagFactory.getInstance();
> > >>>>      tupleFactory = TupleFactory.getInstance();
> > >>>>  }
> > >>>>
> > >>>>  @Override
> > >>>>  public DataBag exec(Tuple input) throws IOException {
> > >>>>      if (input.isNull())
> > >>>>          return null;
> > >>>>      final DataBag bag = bagFactory.newDefaultBag();
> > >>>>      final Integer couter = (Integer) input.get(0);
> > >>>>      if (couter == null)
> > >>>>          return null;
> > >>>>      Tuple tuple = tupleFactory.newTuple();
> > >>>>      for (int i = 0; i < input.size() - 1; i++) {
> > >>>>          if (i % couter == 0) {
> > >>>>              tuple = tupleFactory.newTuple();
> > >>>>              bag.add(tuple);
> > >>>>          }
> > >>>>          tuple.append(input.get(i + 1));
> > >>>>      }
> > >>>>      return bag;
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>> import org.apache.pig.ExecType;
> > >>>> import org.apache.pig.PigServer;
> > >>>> import org.junit.Before;
> > >>>> import org.junit.Test;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>> import java.net.URISyntaxException;
> > >>>> import java.net.URL;
> > >>>>
> > >>>> import static org.junit.Assert.assertTrue;
> > >>>>
> > >>>> /**
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBagTest {
> > >>>>  PigServer pigServer;
> > >>>>  URL inputTxt;
> > >>>>
> > >>>>  @Before
> > >>>>  public void init() throws IOException, URISyntaxException {
> > >>>>      pigServer = new PigServer(ExecType.LOCAL);
> > >>>>      inputTxt =
> > >>>> this.getClass().getResource("bagTest.txt").toURI().toURL();
> > >>>>  }
> > >>>>
> > >>>>  @Test
> > >>>>  public void testSimple() throws IOException {
> > >>>>      pigServer.registerQuery("a = load '" + inputTxt.toExternalForm()
> > +
> > >>>> "' using PigStorage(',') " +
> > >>>>              "as (id:int, a:chararray, b:chararray, c:chararray,
> > >>>> d:chararray);");
> > >>>>      pigServer.registerQuery("last = foreach a generate flatten(" +
> > >>>> ToBag.class.getName() + "(2, id, a, id, b, id, c));");
> > >>>>
> > >>>>      pigServer.deleteFile("target/pigtest/func1.txt");
> > >>>>      pigServer.store("last", "target/pigtest/func1.txt");
> > >>>>      assertTrue(pigServer.fileSize("target/pigtest/func1.txt") > 0);
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>>
> > >>
> > >
> >

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.


[jira] Updated: (PIG-1385) UDF to create tuples and bags

Posted by "hc busy (JIRA)" <ji...@apache.org>.
     [ https://issues.apache.org/jira/browse/PIG-1385?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

hc busy updated PIG-1385:
-------------------------

    Attachment: PIG-1385-trunk.patch

Changed the patch so that the unit test builds and runs.

> UDF to create tuples and bags
> -----------------------------
>
>                 Key: PIG-1385
>                 URL: https://issues.apache.org/jira/browse/PIG-1385
>             Project: Pig
>          Issue Type: New Feature
>          Components: tools
>    Affects Versions: 0.6.0
>            Reporter: hc busy
>            Assignee: hc busy
>         Attachments: PIG-1385-trunk.patch
>
>   Original Estimate: 24h
>  Remaining Estimate: 24h
>
> Based on this conversation:
> > On Tue, Apr 20, 2010 at 6:34 PM, hc busy <hc...@gmail.com> wrote:
> >
> > > What about making them part of the language using symbols?
> > >
> > > instead of
> > >
> > > foreach T generate Tuple($0, $1, $2), Bag($3, $4, $5), $6, $7;
> > >
> > > have language support
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, $6, $7;
> > >
> > > or even:
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, [$6#$7, $8#$9], $10, $11;
> > >
> > >
> > > Is there reason not to do the second or third other than being more
> > > complicated?
> > >
> > > Certainly I'd volunteer to put the top implementation in to the util
> > > package and submit them for builtin's, but the latter syntactic candies
> > > seems more natural..
> > >
> > >
> > >
> > > On Tue, Apr 20, 2010 at 5:24 PM, Alan Gates <ga...@yahoo-inc.com> wrote:
> > >
> > >> The grouping package in piggybank is left over from back when Pig
> > allowed
> > >> users to define grouping functions (0.1).  Functions like these should
> > go in
> > >> evaluation.util.
> > >>
> > >> However, I'd consider putting these in builtin (in main Pig) instead.
> > >>  These are things everyone asks for and they seem like a reasonable
> > addition
> > >> to the core engine.  This will be more of a burden to write (as we'll
> > hold
> > >> them to a higher standard) but of more use to people as well.
> > >>
> > >> Alan.
> > >>
> > >>
> > >> On Apr 19, 2010, at 12:53 PM, hc busy wrote:
> > >>
> > >>  Some times I wonder... I mean, somebody went to the trouble of making a
> > >>> path
> > >>> called
> > >>>
> > >>> org.apache.pig.piggybank.grouping
> > >>>
> > >>> (where it seems like this code belong), but didn't check in any java
> > code
> > >>> into that package.
> > >>>
> > >>>
> > >>> Any comment about where to put this kind of utility classes?
> > >>>
> > >>>
> > >>>
> > >>> On Mon, Apr 19, 2010 at 12:07 PM, Andrey S <oc...@gmail.com> wrote:
> > >>>
> > >>>  2010/4/19 hc busy <hc...@gmail.com>
> > >>>>
> > >>>>  That's just the way it is right now, you can't make bags or tuples
> > >>>>> directly... Maybe we should have some UDF's in piggybank for these:
> > >>>>>
> > >>>>> toBag()
> > >>>>> toTuple(); --which is kinda like exec(Tuple in){return in;}
> > >>>>> TupleToBag(); --some times you need it this way for some reason.
> > >>>>>
> > >>>>>
> > >>>>>  Ok. I place my current code here, may be later I make a patch (if
> > such
> > >>>> implementation is acceptable of course).
> > >>>>
> > >>>> import org.apache.pig.EvalFunc;
> > >>>> import org.apache.pig.data.BagFactory;
> > >>>> import org.apache.pig.data.DataBag;
> > >>>> import org.apache.pig.data.Tuple;
> > >>>> import org.apache.pig.data.TupleFactory;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>>
> > >>>> /**
> > >>>> * Convert any sequence of fields to bag with specified count of
> > >>>> fields<br>
> > >>>> * Schema: count:int, fld1 [, fld2, fld3, fld4... ].
> > >>>> * Output: count=2, then { (fld1, fld2) , (fld3, fld4) ... }
> > >>>> *
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBag extends EvalFunc<DataBag> {
> > >>>>  public BagFactory bagFactory;
> > >>>>  public TupleFactory tupleFactory;
> > >>>>
> > >>>>  public ToBag() {
> > >>>>      bagFactory = BagFactory.getInstance();
> > >>>>      tupleFactory = TupleFactory.getInstance();
> > >>>>  }
> > >>>>
> > >>>>  @Override
> > >>>>  public DataBag exec(Tuple input) throws IOException {
> > >>>>      if (input.isNull())
> > >>>>          return null;
> > >>>>      final DataBag bag = bagFactory.newDefaultBag();
> > >>>>      final Integer couter = (Integer) input.get(0);
> > >>>>      if (couter == null)
> > >>>>          return null;
> > >>>>      Tuple tuple = tupleFactory.newTuple();
> > >>>>      for (int i = 0; i < input.size() - 1; i++) {
> > >>>>          if (i % couter == 0) {
> > >>>>              tuple = tupleFactory.newTuple();
> > >>>>              bag.add(tuple);
> > >>>>          }
> > >>>>          tuple.append(input.get(i + 1));
> > >>>>      }
> > >>>>      return bag;
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>> import org.apache.pig.ExecType;
> > >>>> import org.apache.pig.PigServer;
> > >>>> import org.junit.Before;
> > >>>> import org.junit.Test;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>> import java.net.URISyntaxException;
> > >>>> import java.net.URL;
> > >>>>
> > >>>> import static org.junit.Assert.assertTrue;
> > >>>>
> > >>>> /**
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBagTest {
> > >>>>  PigServer pigServer;
> > >>>>  URL inputTxt;
> > >>>>
> > >>>>  @Before
> > >>>>  public void init() throws IOException, URISyntaxException {
> > >>>>      pigServer = new PigServer(ExecType.LOCAL);
> > >>>>      inputTxt =
> > >>>> this.getClass().getResource("bagTest.txt").toURI().toURL();
> > >>>>  }
> > >>>>
> > >>>>  @Test
> > >>>>  public void testSimple() throws IOException {
> > >>>>      pigServer.registerQuery("a = load '" + inputTxt.toExternalForm()
> > +
> > >>>> "' using PigStorage(',') " +
> > >>>>              "as (id:int, a:chararray, b:chararray, c:chararray,
> > >>>> d:chararray);");
> > >>>>      pigServer.registerQuery("last = foreach a generate flatten(" +
> > >>>> ToBag.class.getName() + "(2, id, a, id, b, id, c));");
> > >>>>
> > >>>>      pigServer.deleteFile("target/pigtest/func1.txt");
> > >>>>      pigServer.store("last", "target/pigtest/func1.txt");
> > >>>>      assertTrue(pigServer.fileSize("target/pigtest/func1.txt") > 0);
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>>
> > >>
> > >
> >

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.


[jira] Commented: (PIG-1385) UDF to create tuples and bags

Posted by "Hadoop QA (JIRA)" <ji...@apache.org>.
    [ https://issues.apache.org/jira/browse/PIG-1385?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12859297#action_12859297 ] 

Hadoop QA commented on PIG-1385:
--------------------------------

-1 overall.  Here are the results of testing the latest attachment 
  http://issues.apache.org/jira/secure/attachment/12442398/PIG-1385-trunk.patch
  against trunk revision 935968.

    +1 @author.  The patch does not contain any @author tags.

    +1 tests included.  The patch appears to include 3 new or modified tests.

    +1 javadoc.  The javadoc tool did not generate any warning messages.

    +1 javac.  The applied patch does not increase the total number of javac compiler warnings.

    +1 findbugs.  The patch does not introduce any new Findbugs warnings.

    -1 release audit.  The applied patch generated 531 release audit warnings (more than the trunk's current 528 warnings).

    -1 core tests.  The patch failed core unit tests.

    +1 contrib tests.  The patch passed contrib unit tests.

Test results: http://hudson.zones.apache.org/hudson/job/Pig-Patch-h8.grid.sp2.yahoo.net/299/testReport/
Release audit warnings: http://hudson.zones.apache.org/hudson/job/Pig-Patch-h8.grid.sp2.yahoo.net/299/artifact/trunk/patchprocess/releaseAuditDiffWarnings.txt
Findbugs warnings: http://hudson.zones.apache.org/hudson/job/Pig-Patch-h8.grid.sp2.yahoo.net/299/artifact/trunk/build/test/findbugs/newPatchFindbugsWarnings.html
Console output: http://hudson.zones.apache.org/hudson/job/Pig-Patch-h8.grid.sp2.yahoo.net/299/console

This message is automatically generated.

> UDF to create tuples and bags
> -----------------------------
>
>                 Key: PIG-1385
>                 URL: https://issues.apache.org/jira/browse/PIG-1385
>             Project: Pig
>          Issue Type: New Feature
>          Components: tools
>    Affects Versions: 0.6.0
>            Reporter: hc busy
>         Attachments: PIG-1385-trunk.patch
>
>   Original Estimate: 24h
>  Remaining Estimate: 24h
>
> Based on this conversation:
> > On Tue, Apr 20, 2010 at 6:34 PM, hc busy <hc...@gmail.com> wrote:
> >
> > > What about making them part of the language using symbols?
> > >
> > > instead of
> > >
> > > foreach T generate Tuple($0, $1, $2), Bag($3, $4, $5), $6, $7;
> > >
> > > have language support
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, $6, $7;
> > >
> > > or even:
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, [$6#$7, $8#$9], $10, $11;
> > >
> > >
> > > Is there reason not to do the second or third other than being more
> > > complicated?
> > >
> > > Certainly I'd volunteer to put the top implementation in to the util
> > > package and submit them for builtin's, but the latter syntactic candies
> > > seems more natural..
> > >
> > >
> > >
> > > On Tue, Apr 20, 2010 at 5:24 PM, Alan Gates <ga...@yahoo-inc.com> wrote:
> > >
> > >> The grouping package in piggybank is left over from back when Pig
> > allowed
> > >> users to define grouping functions (0.1).  Functions like these should
> > go in
> > >> evaluation.util.
> > >>
> > >> However, I'd consider putting these in builtin (in main Pig) instead.
> > >>  These are things everyone asks for and they seem like a reasonable
> > addition
> > >> to the core engine.  This will be more of a burden to write (as we'll
> > hold
> > >> them to a higher standard) but of more use to people as well.
> > >>
> > >> Alan.
> > >>
> > >>
> > >> On Apr 19, 2010, at 12:53 PM, hc busy wrote:
> > >>
> > >>  Some times I wonder... I mean, somebody went to the trouble of making a
> > >>> path
> > >>> called
> > >>>
> > >>> org.apache.pig.piggybank.grouping
> > >>>
> > >>> (where it seems like this code belong), but didn't check in any java
> > code
> > >>> into that package.
> > >>>
> > >>>
> > >>> Any comment about where to put this kind of utility classes?
> > >>>
> > >>>
> > >>>
> > >>> On Mon, Apr 19, 2010 at 12:07 PM, Andrey S <oc...@gmail.com> wrote:
> > >>>
> > >>>  2010/4/19 hc busy <hc...@gmail.com>
> > >>>>
> > >>>>  That's just the way it is right now, you can't make bags or tuples
> > >>>>> directly... Maybe we should have some UDF's in piggybank for these:
> > >>>>>
> > >>>>> toBag()
> > >>>>> toTuple(); --which is kinda like exec(Tuple in){return in;}
> > >>>>> TupleToBag(); --some times you need it this way for some reason.
> > >>>>>
> > >>>>>
> > >>>>>  Ok. I place my current code here, may be later I make a patch (if
> > such
> > >>>> implementation is acceptable of course).
> > >>>>
> > >>>> import org.apache.pig.EvalFunc;
> > >>>> import org.apache.pig.data.BagFactory;
> > >>>> import org.apache.pig.data.DataBag;
> > >>>> import org.apache.pig.data.Tuple;
> > >>>> import org.apache.pig.data.TupleFactory;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>>
> > >>>> /**
> > >>>> * Convert any sequence of fields to bag with specified count of
> > >>>> fields<br>
> > >>>> * Schema: count:int, fld1 [, fld2, fld3, fld4... ].
> > >>>> * Output: count=2, then { (fld1, fld2) , (fld3, fld4) ... }
> > >>>> *
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBag extends EvalFunc<DataBag> {
> > >>>>  public BagFactory bagFactory;
> > >>>>  public TupleFactory tupleFactory;
> > >>>>
> > >>>>  public ToBag() {
> > >>>>      bagFactory = BagFactory.getInstance();
> > >>>>      tupleFactory = TupleFactory.getInstance();
> > >>>>  }
> > >>>>
> > >>>>  @Override
> > >>>>  public DataBag exec(Tuple input) throws IOException {
> > >>>>      if (input.isNull())
> > >>>>          return null;
> > >>>>      final DataBag bag = bagFactory.newDefaultBag();
> > >>>>      final Integer couter = (Integer) input.get(0);
> > >>>>      if (couter == null)
> > >>>>          return null;
> > >>>>      Tuple tuple = tupleFactory.newTuple();
> > >>>>      for (int i = 0; i < input.size() - 1; i++) {
> > >>>>          if (i % couter == 0) {
> > >>>>              tuple = tupleFactory.newTuple();
> > >>>>              bag.add(tuple);
> > >>>>          }
> > >>>>          tuple.append(input.get(i + 1));
> > >>>>      }
> > >>>>      return bag;
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>> import org.apache.pig.ExecType;
> > >>>> import org.apache.pig.PigServer;
> > >>>> import org.junit.Before;
> > >>>> import org.junit.Test;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>> import java.net.URISyntaxException;
> > >>>> import java.net.URL;
> > >>>>
> > >>>> import static org.junit.Assert.assertTrue;
> > >>>>
> > >>>> /**
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBagTest {
> > >>>>  PigServer pigServer;
> > >>>>  URL inputTxt;
> > >>>>
> > >>>>  @Before
> > >>>>  public void init() throws IOException, URISyntaxException {
> > >>>>      pigServer = new PigServer(ExecType.LOCAL);
> > >>>>      inputTxt =
> > >>>> this.getClass().getResource("bagTest.txt").toURI().toURL();
> > >>>>  }
> > >>>>
> > >>>>  @Test
> > >>>>  public void testSimple() throws IOException {
> > >>>>      pigServer.registerQuery("a = load '" + inputTxt.toExternalForm()
> > +
> > >>>> "' using PigStorage(',') " +
> > >>>>              "as (id:int, a:chararray, b:chararray, c:chararray,
> > >>>> d:chararray);");
> > >>>>      pigServer.registerQuery("last = foreach a generate flatten(" +
> > >>>> ToBag.class.getName() + "(2, id, a, id, b, id, c));");
> > >>>>
> > >>>>      pigServer.deleteFile("target/pigtest/func1.txt");
> > >>>>      pigServer.store("last", "target/pigtest/func1.txt");
> > >>>>      assertTrue(pigServer.fileSize("target/pigtest/func1.txt") > 0);
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>>
> > >>
> > >
> >

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.


[jira] Updated: (PIG-1385) UDF to create tuples and bags

Posted by "hc busy (JIRA)" <ji...@apache.org>.
     [ https://issues.apache.org/jira/browse/PIG-1385?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

hc busy updated PIG-1385:
-------------------------

    Affects Version/s: 0.6.0
          Description: 
Based on this conversation:

> On Tue, Apr 20, 2010 at 6:34 PM, hc busy <hc...@gmail.com> wrote:
>
> > What about making them part of the language using symbols?
> >
> > instead of
> >
> > foreach T generate Tuple($0, $1, $2), Bag($3, $4, $5), $6, $7;
> >
> > have language support
> >
> > foreach T generate ($0, $1, $2), {$3, $4, $5}, $6, $7;
> >
> > or even:
> >
> > foreach T generate ($0, $1, $2), {$3, $4, $5}, [$6#$7, $8#$9], $10, $11;
> >
> >
> > Is there a reason not to do the second or third other than being more
> > complicated?
> >
> > Certainly I'd volunteer to put the top implementation into the util
> > package and submit them as builtins, but the latter syntactic sugar
> > seems more natural.
> >
> >
> >
> > On Tue, Apr 20, 2010 at 5:24 PM, Alan Gates <ga...@yahoo-inc.com> wrote:
> >
> >> The grouping package in piggybank is left over from back when Pig
> allowed
> >> users to define grouping functions (0.1).  Functions like these should
> go in
> >> evaluation.util.
> >>
> >> However, I'd consider putting these in builtin (in main Pig) instead.
> >>  These are things everyone asks for and they seem like a reasonable
> addition
> >> to the core engine.  This will be more of a burden to write (as we'll
> hold
> >> them to a higher standard) but of more use to people as well.
> >>
> >> Alan.
> >>
> >>
> >> On Apr 19, 2010, at 12:53 PM, hc busy wrote:
> >>
> >>  Some times I wonder... I mean, somebody went to the trouble of making a
> >>> path
> >>> called
> >>>
> >>> org.apache.pig.piggybank.grouping
> >>>
> >>> (where it seems like this code belongs), but didn't check in any Java
> code
> >>> into that package.
> >>>
> >>>
> >>> Any comments about where to put this kind of utility class?
> >>>
> >>>
> >>>
> >>> On Mon, Apr 19, 2010 at 12:07 PM, Andrey S <oc...@gmail.com> wrote:
> >>>
> >>>  2010/4/19 hc busy <hc...@gmail.com>
> >>>>
> >>>>  That's just the way it is right now, you can't make bags or tuples
> >>>>> directly... Maybe we should have some UDF's in piggybank for these:
> >>>>>
> >>>>> toBag()
> >>>>> toTuple(); --which is kinda like exec(Tuple in){return in;}
> >>>>> TupleToBag(); --some times you need it this way for some reason.
> >>>>>
> >>>>>
> >>>>>  OK. I'll place my current code here; maybe later I'll make a patch (if
> such
> >>>> an implementation is acceptable, of course).
> >>>>
> >>>> import org.apache.pig.EvalFunc;
> >>>> import org.apache.pig.data.BagFactory;
> >>>> import org.apache.pig.data.DataBag;
> >>>> import org.apache.pig.data.Tuple;
> >>>> import org.apache.pig.data.TupleFactory;
> >>>>
> >>>> import java.io.IOException;
> >>>>
> >>>> /**
> >>>> * Convert any sequence of fields to bag with specified count of
> >>>> fields<br>
> >>>> * Schema: count:int, fld1 [, fld2, fld3, fld4... ].
> >>>> * Output: count=2, then { (fld1, fld2) , (fld3, fld4) ... }
> >>>> *
> >>>> * @author astepachev
> >>>> */
> >>>> public class ToBag extends EvalFunc<DataBag> {
> >>>>  public BagFactory bagFactory;
> >>>>  public TupleFactory tupleFactory;
> >>>>
> >>>>  public ToBag() {
> >>>>      bagFactory = BagFactory.getInstance();
> >>>>      tupleFactory = TupleFactory.getInstance();
> >>>>  }
> >>>>
> >>>>  @Override
> >>>>  public DataBag exec(Tuple input) throws IOException {
> >>>>      if (input.isNull())
> >>>>          return null;
> >>>>      final DataBag bag = bagFactory.newDefaultBag();
> >>>>      final Integer couter = (Integer) input.get(0);
> >>>>      if (couter == null)
> >>>>          return null;
> >>>>      Tuple tuple = tupleFactory.newTuple();
> >>>>      for (int i = 0; i < input.size() - 1; i++) {
> >>>>          if (i % couter == 0) {
> >>>>              tuple = tupleFactory.newTuple();
> >>>>              bag.add(tuple);
> >>>>          }
> >>>>          tuple.append(input.get(i + 1));
> >>>>      }
> >>>>      return bag;
> >>>>  }
> >>>> }
> >>>>
> >>>> import org.apache.pig.ExecType;
> >>>> import org.apache.pig.PigServer;
> >>>> import org.junit.Before;
> >>>> import org.junit.Test;
> >>>>
> >>>> import java.io.IOException;
> >>>> import java.net.URISyntaxException;
> >>>> import java.net.URL;
> >>>>
> >>>> import static org.junit.Assert.assertTrue;
> >>>>
> >>>> /**
> >>>> * @author astepachev
> >>>> */
> >>>> public class ToBagTest {
> >>>>  PigServer pigServer;
> >>>>  URL inputTxt;
> >>>>
> >>>>  @Before
> >>>>  public void init() throws IOException, URISyntaxException {
> >>>>      pigServer = new PigServer(ExecType.LOCAL);
> >>>>      inputTxt =
> >>>> this.getClass().getResource("bagTest.txt").toURI().toURL();
> >>>>  }
> >>>>
> >>>>  @Test
> >>>>  public void testSimple() throws IOException {
> >>>>      pigServer.registerQuery("a = load '" + inputTxt.toExternalForm()
> +
> >>>> "' using PigStorage(',') " +
> >>>>              "as (id:int, a:chararray, b:chararray, c:chararray,
> >>>> d:chararray);");
> >>>>      pigServer.registerQuery("last = foreach a generate flatten(" +
> >>>> ToBag.class.getName() + "(2, id, a, id, b, id, c));");
> >>>>
> >>>>      pigServer.deleteFile("target/pigtest/func1.txt");
> >>>>      pigServer.store("last", "target/pigtest/func1.txt");
> >>>>      assertTrue(pigServer.fileSize("target/pigtest/func1.txt") > 0);
> >>>>  }
> >>>> }
> >>>>
> >>>>
> >>
> >
>


  was:
Based on this conversation:

totally, go for it, it'd be pretty straightforward to add this
functionality.
- Hide quoted text -



On Tue, Apr 20, 2010 at 6:45 PM, hc busy <hc...@gmail.com> wrote:

> Hey, while we're on the subject, and I have your attention, can we
> re-factor
> the UDF MaxTupleByFirstField to take constructor?
>
> *define customMaxTuple ExtremalTupleByNthField(n, 'min');*
> *G = group T by id;*
> *M = foreach T generate customMaxTuple(T);
> *
>
> Where n is the nth field, and the second parameter allows us to specify
> "min", "max", "median",  etc...
>
> Does this seem like something useful to everyone?
>
>
>
> On Tue, Apr 20, 2010 at 6:34 PM, hc busy <hc...@gmail.com> wrote:
>
> > What about making them part of the language using symbols?
> >
> > instead of
> >
> > foreach T generate Tuple($0, $1, $2), Bag($3, $4, $5), $6, $7;
> >
> > have language support
> >
> > foreach T generate ($0, $1, $2), {$3, $4, $5}, $6, $7;
> >
> > or even:
> >
> > foreach T generate ($0, $1, $2), {$3, $4, $5}, [$6#$7, $8#$9], $10, $11;
> >
> >
> > Is there reason not to do the second or third other than being more
> > complicated?
> >
> > Certainly I'd volunteer to put the top implementation in to the util
> > package and submit them for builtin's, but the latter syntactic candies
> > seems more natural..
> >
> >
> >
> > On Tue, Apr 20, 2010 at 5:24 PM, Alan Gates <ga...@yahoo-inc.com> wrote:
> >
> >> The grouping package in piggybank is left over from back when Pig
> allowed
> >> users to define grouping functions (0.1).  Functions like these should
> go in
> >> evaluation.util.
> >>
> >> However, I'd consider putting these in builtin (in main Pig) instead.
> >>  These are things everyone asks for and they seem like a reasonable
> addition
> >> to the core engine.  This will be more of a burden to write (as we'll
> hold
> >> them to a higher standard) but of more use to people as well.
> >>
> >> Alan.
> >>
> >>
> >> On Apr 19, 2010, at 12:53 PM, hc busy wrote:
> >>
> >>  Some times I wonder... I mean, somebody went to the trouble of making a
> >>> path
> >>> called
> >>>
> >>> org.apache.pig.piggybank.grouping
> >>>
> >>> (where it seems like this code belong), but didn't check in any java
> code
> >>> into that package.
> >>>
> >>>
> >>> Any comment about where to put this kind of utility classes?
> >>>
> >>>
> >>>
> >>> On Mon, Apr 19, 2010 at 12:07 PM, Andrey S <oc...@gmail.com> wrote:
> >>>
> >>>  2010/4/19 hc busy <hc...@gmail.com>
> >>>>
> >>>>  That's just the way it is right now, you can't make bags or tuples
> >>>>> directly... Maybe we should have some UDF's in piggybank for these:
> >>>>>
> >>>>> toBag()
> >>>>> toTuple(); --which is kinda like exec(Tuple in){return in;}
> >>>>> TupleToBag(); --some times you need it this way for some reason.
> >>>>>
> >>>>>
> >>>>>  Ok. I place my current code here, may be later I make a patch (if
> such
> >>>> implementation is acceptable of course).
> >>>>
> >>>> import org.apache.pig.EvalFunc;
> >>>> import org.apache.pig.data.BagFactory;
> >>>> import org.apache.pig.data.DataBag;
> >>>> import org.apache.pig.data.Tuple;
> >>>> import org.apache.pig.data.TupleFactory;
> >>>>
> >>>> import java.io.IOException;
> >>>>
> >>>> /**
> >>>> * Convert any sequence of fields to bag with specified count of
> >>>> fields<br>
> >>>> * Schema: count:int, fld1 [, fld2, fld3, fld4... ].
> >>>> * Output: count=2, then { (fld1, fld2) , (fld3, fld4) ... }
> >>>> *
> >>>> * @author astepachev
> >>>> */
> >>>> public class ToBag extends EvalFunc<DataBag> {
> >>>>  public BagFactory bagFactory;
> >>>>  public TupleFactory tupleFactory;
> >>>>
> >>>>  public ToBag() {
> >>>>      bagFactory = BagFactory.getInstance();
> >>>>      tupleFactory = TupleFactory.getInstance();
> >>>>  }
> >>>>
> >>>>  @Override
> >>>>  public DataBag exec(Tuple input) throws IOException {
> >>>>      if (input.isNull())
> >>>>          return null;
> >>>>      final DataBag bag = bagFactory.newDefaultBag();
> >>>>      final Integer couter = (Integer) input.get(0);
> >>>>      if (couter == null)
> >>>>          return null;
> >>>>      Tuple tuple = tupleFactory.newTuple();
> >>>>      for (int i = 0; i < input.size() - 1; i++) {
> >>>>          if (i % couter == 0) {
> >>>>              tuple = tupleFactory.newTuple();
> >>>>              bag.add(tuple);
> >>>>          }
> >>>>          tuple.append(input.get(i + 1));
> >>>>      }
> >>>>      return bag;
> >>>>  }
> >>>> }
> >>>>
> >>>> import org.apache.pig.ExecType;
> >>>> import org.apache.pig.PigServer;
> >>>> import org.junit.Before;
> >>>> import org.junit.Test;
> >>>>
> >>>> import java.io.IOException;
> >>>> import java.net.URISyntaxException;
> >>>> import java.net.URL;
> >>>>
> >>>> import static org.junit.Assert.assertTrue;
> >>>>
> >>>> /**
> >>>> * @author astepachev
> >>>> */
> >>>> public class ToBagTest {
> >>>>  PigServer pigServer;
> >>>>  URL inputTxt;
> >>>>
> >>>>  @Before
> >>>>  public void init() throws IOException, URISyntaxException {
> >>>>      pigServer = new PigServer(ExecType.LOCAL);
> >>>>      inputTxt =
> >>>> this.getClass().getResource("bagTest.txt").toURI().toURL();
> >>>>  }
> >>>>
> >>>>  @Test
> >>>>  public void testSimple() throws IOException {
> >>>>      pigServer.registerQuery("a = load '" + inputTxt.toExternalForm()
> +
> >>>> "' using PigStorage(',') " +
> >>>>              "as (id:int, a:chararray, b:chararray, c:chararray,
> >>>> d:chararray);");
> >>>>      pigServer.registerQuery("last = foreach a generate flatten(" +
> >>>> ToBag.class.getName() + "(2, id, a, id, b, id, c));");
> >>>>
> >>>>      pigServer.deleteFile("target/pigtest/func1.txt");
> >>>>      pigServer.store("last", "target/pigtest/func1.txt");
> >>>>      assertTrue(pigServer.fileSize("target/pigtest/func1.txt") > 0);
> >>>>  }
> >>>> }
> >>>>
> >>>>
> >>
> >
>



> UDF to create tuples and bags
> -----------------------------
>
>                 Key: PIG-1385
>                 URL: https://issues.apache.org/jira/browse/PIG-1385
>             Project: Pig
>          Issue Type: New Feature
>          Components: tools
>    Affects Versions: 0.6.0
>            Reporter: hc busy
>   Original Estimate: 24h
>  Remaining Estimate: 24h
>
> Based on this conversation:
> > On Tue, Apr 20, 2010 at 6:34 PM, hc busy <hc...@gmail.com> wrote:
> >
> > > What about making them part of the language using symbols?
> > >
> > > instead of
> > >
> > > foreach T generate Tuple($0, $1, $2), Bag($3, $4, $5), $6, $7;
> > >
> > > have language support
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, $6, $7;
> > >
> > > or even:
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, [$6#$7, $8#$9], $10, $11;
> > >
> > >
> > > Is there reason not to do the second or third other than being more
> > > complicated?
> > >
> > > Certainly I'd volunteer to put the top implementation in to the util
> > > package and submit them for builtin's, but the latter syntactic candies
> > > seems more natural..
> > >
> > >
> > >
> > > On Tue, Apr 20, 2010 at 5:24 PM, Alan Gates <ga...@yahoo-inc.com> wrote:
> > >
> > >> The grouping package in piggybank is left over from back when Pig
> > allowed
> > >> users to define grouping functions (0.1).  Functions like these should
> > go in
> > >> evaluation.util.
> > >>
> > >> However, I'd consider putting these in builtin (in main Pig) instead.
> > >>  These are things everyone asks for and they seem like a reasonable
> > addition
> > >> to the core engine.  This will be more of a burden to write (as we'll
> > hold
> > >> them to a higher standard) but of more use to people as well.
> > >>
> > >> Alan.
> > >>
> > >>
> > >> On Apr 19, 2010, at 12:53 PM, hc busy wrote:
> > >>
> > >>  Some times I wonder... I mean, somebody went to the trouble of making a
> > >>> path
> > >>> called
> > >>>
> > >>> org.apache.pig.piggybank.grouping
> > >>>
> > >>> (where it seems like this code belong), but didn't check in any java
> > code
> > >>> into that package.
> > >>>
> > >>>
> > >>> Any comment about where to put this kind of utility classes?
> > >>>
> > >>>
> > >>>
> > >>> On Mon, Apr 19, 2010 at 12:07 PM, Andrey S <oc...@gmail.com> wrote:
> > >>>
> > >>>  2010/4/19 hc busy <hc...@gmail.com>
> > >>>>
> > >>>>  That's just the way it is right now, you can't make bags or tuples
> > >>>>> directly... Maybe we should have some UDF's in piggybank for these:
> > >>>>>
> > >>>>> toBag()
> > >>>>> toTuple(); --which is kinda like exec(Tuple in){return in;}
> > >>>>> TupleToBag(); --some times you need it this way for some reason.
> > >>>>>
> > >>>>>
> > >>>>>  Ok. I place my current code here, may be later I make a patch (if
> > such
> > >>>> implementation is acceptable of course).
> > >>>>
> > >>>> import org.apache.pig.EvalFunc;
> > >>>> import org.apache.pig.data.BagFactory;
> > >>>> import org.apache.pig.data.DataBag;
> > >>>> import org.apache.pig.data.Tuple;
> > >>>> import org.apache.pig.data.TupleFactory;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>>
> > >>>> /**
> > >>>> * Convert any sequence of fields to bag with specified count of
> > >>>> fields<br>
> > >>>> * Schema: count:int, fld1 [, fld2, fld3, fld4... ].
> > >>>> * Output: count=2, then { (fld1, fld2) , (fld3, fld4) ... }
> > >>>> *
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBag extends EvalFunc<DataBag> {
> > >>>>  public BagFactory bagFactory;
> > >>>>  public TupleFactory tupleFactory;
> > >>>>
> > >>>>  public ToBag() {
> > >>>>      bagFactory = BagFactory.getInstance();
> > >>>>      tupleFactory = TupleFactory.getInstance();
> > >>>>  }
> > >>>>
> > >>>>  @Override
> > >>>>  public DataBag exec(Tuple input) throws IOException {
> > >>>>      if (input.isNull())
> > >>>>          return null;
> > >>>>      final DataBag bag = bagFactory.newDefaultBag();
> > >>>>      final Integer couter = (Integer) input.get(0);
> > >>>>      if (couter == null)
> > >>>>          return null;
> > >>>>      Tuple tuple = tupleFactory.newTuple();
> > >>>>      for (int i = 0; i < input.size() - 1; i++) {
> > >>>>          if (i % couter == 0) {
> > >>>>              tuple = tupleFactory.newTuple();
> > >>>>              bag.add(tuple);
> > >>>>          }
> > >>>>          tuple.append(input.get(i + 1));
> > >>>>      }
> > >>>>      return bag;
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>> import org.apache.pig.ExecType;
> > >>>> import org.apache.pig.PigServer;
> > >>>> import org.junit.Before;
> > >>>> import org.junit.Test;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>> import java.net.URISyntaxException;
> > >>>> import java.net.URL;
> > >>>>
> > >>>> import static org.junit.Assert.assertTrue;
> > >>>>
> > >>>> /**
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBagTest {
> > >>>>  PigServer pigServer;
> > >>>>  URL inputTxt;
> > >>>>
> > >>>>  @Before
> > >>>>  public void init() throws IOException, URISyntaxException {
> > >>>>      pigServer = new PigServer(ExecType.LOCAL);
> > >>>>      inputTxt =
> > >>>> this.getClass().getResource("bagTest.txt").toURI().toURL();
> > >>>>  }
> > >>>>
> > >>>>  @Test
> > >>>>  public void testSimple() throws IOException {
> > >>>>      pigServer.registerQuery("a = load '" + inputTxt.toExternalForm()
> > +
> > >>>> "' using PigStorage(',') " +
> > >>>>              "as (id:int, a:chararray, b:chararray, c:chararray,
> > >>>> d:chararray);");
> > >>>>      pigServer.registerQuery("last = foreach a generate flatten(" +
> > >>>> ToBag.class.getName() + "(2, id, a, id, b, id, c));");
> > >>>>
> > >>>>      pigServer.deleteFile("target/pigtest/func1.txt");
> > >>>>      pigServer.store("last", "target/pigtest/func1.txt");
> > >>>>      assertTrue(pigServer.fileSize("target/pigtest/func1.txt") > 0);
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>>
> > >>
> > >
> >

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.


[jira] Assigned: (PIG-1385) UDF to create tuples and bags

Posted by "Alan Gates (JIRA)" <ji...@apache.org>.
     [ https://issues.apache.org/jira/browse/PIG-1385?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Alan Gates reassigned PIG-1385:
-------------------------------

    Assignee: hc busy

> UDF to create tuples and bags
> -----------------------------
>
>                 Key: PIG-1385
>                 URL: https://issues.apache.org/jira/browse/PIG-1385
>             Project: Pig
>          Issue Type: New Feature
>          Components: tools
>    Affects Versions: 0.6.0
>            Reporter: hc busy
>            Assignee: hc busy
>         Attachments: PIG-1385-trunk.patch
>
>   Original Estimate: 24h
>  Remaining Estimate: 24h
>
> Based on this conversation:
> > On Tue, Apr 20, 2010 at 6:34 PM, hc busy <hc...@gmail.com> wrote:
> >
> > > What about making them part of the language using symbols?
> > >
> > > instead of
> > >
> > > foreach T generate Tuple($0, $1, $2), Bag($3, $4, $5), $6, $7;
> > >
> > > have language support
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, $6, $7;
> > >
> > > or even:
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, [$6#$7, $8#$9], $10, $11;
> > >
> > >
> > > Is there reason not to do the second or third other than being more
> > > complicated?
> > >
> > > Certainly I'd volunteer to put the top implementation in to the util
> > > package and submit them for builtin's, but the latter syntactic candies
> > > seems more natural..
> > >
> > >
> > >
> > > On Tue, Apr 20, 2010 at 5:24 PM, Alan Gates <ga...@yahoo-inc.com> wrote:
> > >
> > >> The grouping package in piggybank is left over from back when Pig
> > allowed
> > >> users to define grouping functions (0.1).  Functions like these should
> > go in
> > >> evaluation.util.
> > >>
> > >> However, I'd consider putting these in builtin (in main Pig) instead.
> > >>  These are things everyone asks for and they seem like a reasonable
> > addition
> > >> to the core engine.  This will be more of a burden to write (as we'll
> > hold
> > >> them to a higher standard) but of more use to people as well.
> > >>
> > >> Alan.
> > >>
> > >>
> > >> On Apr 19, 2010, at 12:53 PM, hc busy wrote:
> > >>
> > >>  Some times I wonder... I mean, somebody went to the trouble of making a
> > >>> path
> > >>> called
> > >>>
> > >>> org.apache.pig.piggybank.grouping
> > >>>
> > >>> (where it seems like this code belong), but didn't check in any java
> > code
> > >>> into that package.
> > >>>
> > >>>
> > >>> Any comment about where to put this kind of utility classes?
> > >>>
> > >>>
> > >>>
> > >>> On Mon, Apr 19, 2010 at 12:07 PM, Andrey S <oc...@gmail.com> wrote:
> > >>>
> > >>>  2010/4/19 hc busy <hc...@gmail.com>
> > >>>>
> > >>>>  That's just the way it is right now, you can't make bags or tuples
> > >>>>> directly... Maybe we should have some UDF's in piggybank for these:
> > >>>>>
> > >>>>> toBag()
> > >>>>> toTuple(); --which is kinda like exec(Tuple in){return in;}
> > >>>>> TupleToBag(); --some times you need it this way for some reason.
> > >>>>>
> > >>>>>
> > >>>>>  Ok. I place my current code here, may be later I make a patch (if
> > such
> > >>>> implementation is acceptable of course).
> > >>>>
> > >>>> import org.apache.pig.EvalFunc;
> > >>>> import org.apache.pig.data.BagFactory;
> > >>>> import org.apache.pig.data.DataBag;
> > >>>> import org.apache.pig.data.Tuple;
> > >>>> import org.apache.pig.data.TupleFactory;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>>
> > >>>> /**
> > >>>> * Convert any sequence of fields to bag with specified count of
> > >>>> fields<br>
> > >>>> * Schema: count:int, fld1 [, fld2, fld3, fld4... ].
> > >>>> * Output: count=2, then { (fld1, fld2) , (fld3, fld4) ... }
> > >>>> *
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBag extends EvalFunc<DataBag> {
> > >>>>  public BagFactory bagFactory;
> > >>>>  public TupleFactory tupleFactory;
> > >>>>
> > >>>>  public ToBag() {
> > >>>>      bagFactory = BagFactory.getInstance();
> > >>>>      tupleFactory = TupleFactory.getInstance();
> > >>>>  }
> > >>>>
> > >>>>  @Override
> > >>>>  public DataBag exec(Tuple input) throws IOException {
> > >>>>      if (input.isNull())
> > >>>>          return null;
> > >>>>      final DataBag bag = bagFactory.newDefaultBag();
> > >>>>      final Integer couter = (Integer) input.get(0);
> > >>>>      if (couter == null)
> > >>>>          return null;
> > >>>>      Tuple tuple = tupleFactory.newTuple();
> > >>>>      for (int i = 0; i < input.size() - 1; i++) {
> > >>>>          if (i % couter == 0) {
> > >>>>              tuple = tupleFactory.newTuple();
> > >>>>              bag.add(tuple);
> > >>>>          }
> > >>>>          tuple.append(input.get(i + 1));
> > >>>>      }
> > >>>>      return bag;
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>> import org.apache.pig.ExecType;
> > >>>> import org.apache.pig.PigServer;
> > >>>> import org.junit.Before;
> > >>>> import org.junit.Test;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>> import java.net.URISyntaxException;
> > >>>> import java.net.URL;
> > >>>>
> > >>>> import static org.junit.Assert.assertTrue;
> > >>>>
> > >>>> /**
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBagTest {
> > >>>>  PigServer pigServer;
> > >>>>  URL inputTxt;
> > >>>>
> > >>>>  @Before
> > >>>>  public void init() throws IOException, URISyntaxException {
> > >>>>      pigServer = new PigServer(ExecType.LOCAL);
> > >>>>      inputTxt =
> > >>>> this.getClass().getResource("bagTest.txt").toURI().toURL();
> > >>>>  }
> > >>>>
> > >>>>  @Test
> > >>>>  public void testSimple() throws IOException {
> > >>>>      pigServer.registerQuery("a = load '" + inputTxt.toExternalForm()
> > +
> > >>>> "' using PigStorage(',') " +
> > >>>>              "as (id:int, a:chararray, b:chararray, c:chararray,
> > >>>> d:chararray);");
> > >>>>      pigServer.registerQuery("last = foreach a generate flatten(" +
> > >>>> ToBag.class.getName() + "(2, id, a, id, b, id, c));");
> > >>>>
> > >>>>      pigServer.deleteFile("target/pigtest/func1.txt");
> > >>>>      pigServer.store("last", "target/pigtest/func1.txt");
> > >>>>      assertTrue(pigServer.fileSize("target/pigtest/func1.txt") > 0);
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>>
> > >>
> > >
> >

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.


[jira] Updated: (PIG-1385) UDF to create tuples and bags

Posted by "Alan Gates (JIRA)" <ji...@apache.org>.
     [ https://issues.apache.org/jira/browse/PIG-1385?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Alan Gates updated PIG-1385:
----------------------------

           Status: Resolved  (was: Patch Available)
    Fix Version/s: 0.8.0
       Resolution: Fixed

I ran the tests that failed on the last Hudson run, and they all passed.  I also ran the commit tests and the piggybank tests.  All looked good.  Patch checked in.  Thanks, hc.

> UDF to create tuples and bags
> -----------------------------
>
>                 Key: PIG-1385
>                 URL: https://issues.apache.org/jira/browse/PIG-1385
>             Project: Pig
>          Issue Type: New Feature
>          Components: tools
>    Affects Versions: 0.6.0
>            Reporter: hc busy
>            Assignee: hc busy
>             Fix For: 0.8.0
>
>         Attachments: PIG-1385-trunk.patch
>
>   Original Estimate: 24h
>  Remaining Estimate: 24h
>
> Based on this conversation:
> > On Tue, Apr 20, 2010 at 6:34 PM, hc busy <hc...@gmail.com> wrote:
> >
> > > What about making them part of the language using symbols?
> > >
> > > instead of
> > >
> > > foreach T generate Tuple($0, $1, $2), Bag($3, $4, $5), $6, $7;
> > >
> > > have language support
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, $6, $7;
> > >
> > > or even:
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, [$6#$7, $8#$9], $10, $11;
> > >
> > >
> > > Is there reason not to do the second or third other than being more
> > > complicated?
> > >
> > > Certainly I'd volunteer to put the top implementation in to the util
> > > package and submit them for builtin's, but the latter syntactic candies
> > > seems more natural..
> > >
> > >
> > >
> > > On Tue, Apr 20, 2010 at 5:24 PM, Alan Gates <ga...@yahoo-inc.com> wrote:
> > >
> > >> The grouping package in piggybank is left over from back when Pig
> > allowed
> > >> users to define grouping functions (0.1).  Functions like these should
> > go in
> > >> evaluation.util.
> > >>
> > >> However, I'd consider putting these in builtin (in main Pig) instead.
> > >>  These are things everyone asks for and they seem like a reasonable
> > addition
> > >> to the core engine.  This will be more of a burden to write (as we'll
> > hold
> > >> them to a higher standard) but of more use to people as well.
> > >>
> > >> Alan.
> > >>
> > >>
> > >> On Apr 19, 2010, at 12:53 PM, hc busy wrote:
> > >>
> > >>  Some times I wonder... I mean, somebody went to the trouble of making a
> > >>> path
> > >>> called
> > >>>
> > >>> org.apache.pig.piggybank.grouping
> > >>>
> > >>> (where it seems like this code belong), but didn't check in any java
> > code
> > >>> into that package.
> > >>>
> > >>>
> > >>> Any comment about where to put this kind of utility classes?
> > >>>
> > >>>
> > >>>
> > >>> On Mon, Apr 19, 2010 at 12:07 PM, Andrey S <oc...@gmail.com> wrote:
> > >>>
> > >>>  2010/4/19 hc busy <hc...@gmail.com>
> > >>>>
> > >>>>  That's just the way it is right now, you can't make bags or tuples
> > >>>>> directly... Maybe we should have some UDF's in piggybank for these:
> > >>>>>
> > >>>>> toBag()
> > >>>>> toTuple(); --which is kinda like exec(Tuple in){return in;}
> > >>>>> TupleToBag(); --some times you need it this way for some reason.
> > >>>>>
> > >>>>>
> > >>>>>  Ok. I place my current code here, may be later I make a patch (if
> > such
> > >>>> implementation is acceptable of course).
> > >>>>
> > >>>> import org.apache.pig.EvalFunc;
> > >>>> import org.apache.pig.data.BagFactory;
> > >>>> import org.apache.pig.data.DataBag;
> > >>>> import org.apache.pig.data.Tuple;
> > >>>> import org.apache.pig.data.TupleFactory;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>>
> > >>>> /**
> > >>>> * Convert any sequence of fields to bag with specified count of
> > >>>> fields<br>
> > >>>> * Schema: count:int, fld1 [, fld2, fld3, fld4... ].
> > >>>> * Output: count=2, then { (fld1, fld2) , (fld3, fld4) ... }
> > >>>> *
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBag extends EvalFunc<DataBag> {
> > >>>>  public BagFactory bagFactory;
> > >>>>  public TupleFactory tupleFactory;
> > >>>>
> > >>>>  public ToBag() {
> > >>>>      bagFactory = BagFactory.getInstance();
> > >>>>      tupleFactory = TupleFactory.getInstance();
> > >>>>  }
> > >>>>
> > >>>>  @Override
> > >>>>  public DataBag exec(Tuple input) throws IOException {
> > >>>>      if (input.isNull())
> > >>>>          return null;
> > >>>>      final DataBag bag = bagFactory.newDefaultBag();
> > >>>>      final Integer couter = (Integer) input.get(0);
> > >>>>      if (couter == null)
> > >>>>          return null;
> > >>>>      Tuple tuple = tupleFactory.newTuple();
> > >>>>      for (int i = 0; i < input.size() - 1; i++) {
> > >>>>          if (i % couter == 0) {
> > >>>>              tuple = tupleFactory.newTuple();
> > >>>>              bag.add(tuple);
> > >>>>          }
> > >>>>          tuple.append(input.get(i + 1));
> > >>>>      }
> > >>>>      return bag;
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>> import org.apache.pig.ExecType;
> > >>>> import org.apache.pig.PigServer;
> > >>>> import org.junit.Before;
> > >>>> import org.junit.Test;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>> import java.net.URISyntaxException;
> > >>>> import java.net.URL;
> > >>>>
> > >>>> import static org.junit.Assert.assertTrue;
> > >>>>
> > >>>> /**
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBagTest {
> > >>>>  PigServer pigServer;
> > >>>>  URL inputTxt;
> > >>>>
> > >>>>  @Before
> > >>>>  public void init() throws IOException, URISyntaxException {
> > >>>>      pigServer = new PigServer(ExecType.LOCAL);
> > >>>>      inputTxt =
> > >>>> this.getClass().getResource("bagTest.txt").toURI().toURL();
> > >>>>  }
> > >>>>
> > >>>>  @Test
> > >>>>  public void testSimple() throws IOException {
> > >>>>      pigServer.registerQuery("a = load '" + inputTxt.toExternalForm()
> > +
> > >>>> "' using PigStorage(',') " +
> > >>>>              "as (id:int, a:chararray, b:chararray, c:chararray,
> > >>>> d:chararray);");
> > >>>>      pigServer.registerQuery("last = foreach a generate flatten(" +
> > >>>> ToBag.class.getName() + "(2, id, a, id, b, id, c));");
> > >>>>
> > >>>>      pigServer.deleteFile("target/pigtest/func1.txt");
> > >>>>      pigServer.store("last", "target/pigtest/func1.txt");
> > >>>>      assertTrue(pigServer.fileSize("target/pigtest/func1.txt") > 0);
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>>
> > >>
> > >
> >

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.


[jira] Updated: (PIG-1385) UDF to create tuples and bags

Posted by "hc busy (JIRA)" <ji...@apache.org>.
     [ https://issues.apache.org/jira/browse/PIG-1385?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

hc busy updated PIG-1385:
-------------------------

    Status: Open  (was: Patch Available)

> UDF to create tuples and bags
> -----------------------------
>
>                 Key: PIG-1385
>                 URL: https://issues.apache.org/jira/browse/PIG-1385
>             Project: Pig
>          Issue Type: New Feature
>          Components: tools
>    Affects Versions: 0.6.0
>            Reporter: hc busy
>            Assignee: hc busy
>         Attachments: PIG-1385-trunk.patch
>
>   Original Estimate: 24h
>  Remaining Estimate: 24h
>
> Based on this conversation:
> > On Tue, Apr 20, 2010 at 6:34 PM, hc busy <hc...@gmail.com> wrote:
> >
> > > What about making them part of the language using symbols?
> > >
> > > instead of
> > >
> > > foreach T generate Tuple($0, $1, $2), Bag($3, $4, $5), $6, $7;
> > >
> > > have language support
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, $6, $7;
> > >
> > > or even:
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, [$6#$7, $8#$9], $10, $11;
> > >
> > >
> > > Is there reason not to do the second or third other than being more
> > > complicated?
> > >
> > > Certainly I'd volunteer to put the top implementation in to the util
> > > package and submit them for builtin's, but the latter syntactic candies
> > > seems more natural..
> > >
> > >
> > >
> > > On Tue, Apr 20, 2010 at 5:24 PM, Alan Gates <ga...@yahoo-inc.com> wrote:
> > >
> > >> The grouping package in piggybank is left over from back when Pig
> > allowed
> > >> users to define grouping functions (0.1).  Functions like these should
> > go in
> > >> evaluation.util.
> > >>
> > >> However, I'd consider putting these in builtin (in main Pig) instead.
> > >>  These are things everyone asks for and they seem like a reasonable
> > addition
> > >> to the core engine.  This will be more of a burden to write (as we'll
> > hold
> > >> them to a higher standard) but of more use to people as well.
> > >>
> > >> Alan.
> > >>
> > >>
> > >> On Apr 19, 2010, at 12:53 PM, hc busy wrote:
> > >>
> > >>  Some times I wonder... I mean, somebody went to the trouble of making a
> > >>> path
> > >>> called
> > >>>
> > >>> org.apache.pig.piggybank.grouping
> > >>>
> > >>> (where it seems like this code belong), but didn't check in any java
> > code
> > >>> into that package.
> > >>>
> > >>>
> > >>> Any comment about where to put this kind of utility classes?
> > >>>
> > >>>
> > >>>
> > >>> On Mon, Apr 19, 2010 at 12:07 PM, Andrey S <oc...@gmail.com> wrote:
> > >>>
> > >>>  2010/4/19 hc busy <hc...@gmail.com>
> > >>>>
> > >>>>  That's just the way it is right now, you can't make bags or tuples
> > >>>>> directly... Maybe we should have some UDF's in piggybank for these:
> > >>>>>
> > >>>>> toBag()
> > >>>>> toTuple(); --which is kinda like exec(Tuple in){return in;}
> > >>>>> TupleToBag(); --some times you need it this way for some reason.
> > >>>>>
> > >>>>>
> > >>>>>  Ok. I place my current code here, may be later I make a patch (if
> > such
> > >>>> implementation is acceptable of course).
> > >>>>
> > >>>> import org.apache.pig.EvalFunc;
> > >>>> import org.apache.pig.data.BagFactory;
> > >>>> import org.apache.pig.data.DataBag;
> > >>>> import org.apache.pig.data.Tuple;
> > >>>> import org.apache.pig.data.TupleFactory;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>>
> > >>>> /**
> > >>>> * Convert any sequence of fields to bag with specified count of
> > >>>> fields<br>
> > >>>> * Schema: count:int, fld1 [, fld2, fld3, fld4... ].
> > >>>> * Output: count=2, then { (fld1, fld2) , (fld3, fld4) ... }
> > >>>> *
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBag extends EvalFunc<DataBag> {
> > >>>>  public BagFactory bagFactory;
> > >>>>  public TupleFactory tupleFactory;
> > >>>>
> > >>>>  public ToBag() {
> > >>>>      bagFactory = BagFactory.getInstance();
> > >>>>      tupleFactory = TupleFactory.getInstance();
> > >>>>  }
> > >>>>
> > >>>>  @Override
> > >>>>  public DataBag exec(Tuple input) throws IOException {
> > >>>>      if (input.isNull())
> > >>>>          return null;
> > >>>>      final DataBag bag = bagFactory.newDefaultBag();
> > >>>>      final Integer couter = (Integer) input.get(0);
> > >>>>      if (couter == null)
> > >>>>          return null;
> > >>>>      Tuple tuple = tupleFactory.newTuple();
> > >>>>      for (int i = 0; i < input.size() - 1; i++) {
> > >>>>          if (i % couter == 0) {
> > >>>>              tuple = tupleFactory.newTuple();
> > >>>>              bag.add(tuple);
> > >>>>          }
> > >>>>          tuple.append(input.get(i + 1));
> > >>>>      }
> > >>>>      return bag;
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>> import org.apache.pig.ExecType;
> > >>>> import org.apache.pig.PigServer;
> > >>>> import org.junit.Before;
> > >>>> import org.junit.Test;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>> import java.net.URISyntaxException;
> > >>>> import java.net.URL;
> > >>>>
> > >>>> import static org.junit.Assert.assertTrue;
> > >>>>
> > >>>> /**
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBagTest {
> > >>>>  PigServer pigServer;
> > >>>>  URL inputTxt;
> > >>>>
> > >>>>  @Before
> > >>>>  public void init() throws IOException, URISyntaxException {
> > >>>>      pigServer = new PigServer(ExecType.LOCAL);
> > >>>>      inputTxt =
> > >>>> this.getClass().getResource("bagTest.txt").toURI().toURL();
> > >>>>  }
> > >>>>
> > >>>>  @Test
> > >>>>  public void testSimple() throws IOException {
> > >>>>      pigServer.registerQuery("a = load '" + inputTxt.toExternalForm()
> > +
> > >>>> "' using PigStorage(',') " +
> > >>>>              "as (id:int, a:chararray, b:chararray, c:chararray,
> > >>>> d:chararray);");
> > >>>>      pigServer.registerQuery("last = foreach a generate flatten(" +
> > >>>> ToBag.class.getName() + "(2, id, a, id, b, id, c));");
> > >>>>
> > >>>>      pigServer.deleteFile("target/pigtest/func1.txt");
> > >>>>      pigServer.store("last", "target/pigtest/func1.txt");
> > >>>>      assertTrue(pigServer.fileSize("target/pigtest/func1.txt") > 0);
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>>
> > >>
> > >
> >

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.


[jira] Commented: (PIG-1385) UDF to create tuples and bags

Posted by "hc busy (JIRA)" <ji...@apache.org>.
    [ https://issues.apache.org/jira/browse/PIG-1385?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12860365#action_12860365 ] 

hc busy commented on PIG-1385:
------------------------------

ok, ok, moving tests to evaluation.util requires that you import the classes under test.

Here we usually keep tests in the same package as the code they test (but sitting under test/ instead of src/), so that we can test package-private methods and don't have to import the class under test (CUT). But other than that, I guess I should follow convention. I agree with these changes.

> UDF to create tuples and bags
> -----------------------------
>
>                 Key: PIG-1385
>                 URL: https://issues.apache.org/jira/browse/PIG-1385
>             Project: Pig
>          Issue Type: New Feature
>          Components: tools
>    Affects Versions: 0.6.0
>            Reporter: hc busy
>            Assignee: hc busy
>         Attachments: PIG-1385-trunk.patch
>
>   Original Estimate: 24h
>  Remaining Estimate: 24h
>
> Based on this conversation:
> > On Tue, Apr 20, 2010 at 6:34 PM, hc busy <hc...@gmail.com> wrote:
> >
> > > What about making them part of the language using symbols?
> > >
> > > instead of
> > >
> > > foreach T generate Tuple($0, $1, $2), Bag($3, $4, $5), $6, $7;
> > >
> > > have language support
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, $6, $7;
> > >
> > > or even:
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, [$6#$7, $8#$9], $10, $11;
> > >
> > >
> > > Is there reason not to do the second or third other than being more
> > > complicated?
> > >
> > > Certainly I'd volunteer to put the top implementation in to the util
> > > package and submit them for builtin's, but the latter syntactic candies
> > > seems more natural..
> > >
> > >
> > >
> > > On Tue, Apr 20, 2010 at 5:24 PM, Alan Gates <ga...@yahoo-inc.com> wrote:
> > >
> > >> The grouping package in piggybank is left over from back when Pig
> > allowed
> > >> users to define grouping functions (0.1).  Functions like these should
> > go in
> > >> evaluation.util.
> > >>
> > >> However, I'd consider putting these in builtin (in main Pig) instead.
> > >>  These are things everyone asks for and they seem like a reasonable
> > addition
> > >> to the core engine.  This will be more of a burden to write (as we'll
> > hold
> > >> them to a higher standard) but of more use to people as well.
> > >>
> > >> Alan.
> > >>
> > >>
> > >> On Apr 19, 2010, at 12:53 PM, hc busy wrote:
> > >>
> > >>  Some times I wonder... I mean, somebody went to the trouble of making a
> > >>> path
> > >>> called
> > >>>
> > >>> org.apache.pig.piggybank.grouping
> > >>>
> > >>> (where it seems like this code belong), but didn't check in any java
> > code
> > >>> into that package.
> > >>>
> > >>>
> > >>> Any comment about where to put this kind of utility classes?
> > >>>
> > >>>
> > >>>
> > >>> On Mon, Apr 19, 2010 at 12:07 PM, Andrey S <oc...@gmail.com> wrote:
> > >>>
> > >>>  2010/4/19 hc busy <hc...@gmail.com>
> > >>>>
> > >>>>  That's just the way it is right now, you can't make bags or tuples
> > >>>>> directly... Maybe we should have some UDF's in piggybank for these:
> > >>>>>
> > >>>>> toBag()
> > >>>>> toTuple(); --which is kinda like exec(Tuple in){return in;}
> > >>>>> TupleToBag(); --some times you need it this way for some reason.
> > >>>>>
> > >>>>>
> > >>>>>  Ok. I place my current code here, may be later I make a patch (if
> > such
> > >>>> implementation is acceptable of course).
> > >>>>
> > >>>> import org.apache.pig.EvalFunc;
> > >>>> import org.apache.pig.data.BagFactory;
> > >>>> import org.apache.pig.data.DataBag;
> > >>>> import org.apache.pig.data.Tuple;
> > >>>> import org.apache.pig.data.TupleFactory;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>>
> > >>>> /**
> > >>>> * Convert any sequence of fields to bag with specified count of
> > >>>> fields<br>
> > >>>> * Schema: count:int, fld1 [, fld2, fld3, fld4... ].
> > >>>> * Output: count=2, then { (fld1, fld2) , (fld3, fld4) ... }
> > >>>> *
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBag extends EvalFunc<DataBag> {
> > >>>>  public BagFactory bagFactory;
> > >>>>  public TupleFactory tupleFactory;
> > >>>>
> > >>>>  public ToBag() {
> > >>>>      bagFactory = BagFactory.getInstance();
> > >>>>      tupleFactory = TupleFactory.getInstance();
> > >>>>  }
> > >>>>
> > >>>>  @Override
> > >>>>  public DataBag exec(Tuple input) throws IOException {
> > >>>>      if (input.isNull())
> > >>>>          return null;
> > >>>>      final DataBag bag = bagFactory.newDefaultBag();
> > >>>>      final Integer couter = (Integer) input.get(0);
> > >>>>      if (couter == null)
> > >>>>          return null;
> > >>>>      Tuple tuple = tupleFactory.newTuple();
> > >>>>      for (int i = 0; i < input.size() - 1; i++) {
> > >>>>          if (i % couter == 0) {
> > >>>>              tuple = tupleFactory.newTuple();
> > >>>>              bag.add(tuple);
> > >>>>          }
> > >>>>          tuple.append(input.get(i + 1));
> > >>>>      }
> > >>>>      return bag;
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>> import org.apache.pig.ExecType;
> > >>>> import org.apache.pig.PigServer;
> > >>>> import org.junit.Before;
> > >>>> import org.junit.Test;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>> import java.net.URISyntaxException;
> > >>>> import java.net.URL;
> > >>>>
> > >>>> import static org.junit.Assert.assertTrue;
> > >>>>
> > >>>> /**
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBagTest {
> > >>>>  PigServer pigServer;
> > >>>>  URL inputTxt;
> > >>>>
> > >>>>  @Before
> > >>>>  public void init() throws IOException, URISyntaxException {
> > >>>>      pigServer = new PigServer(ExecType.LOCAL);
> > >>>>      inputTxt =
> > >>>> this.getClass().getResource("bagTest.txt").toURI().toURL();
> > >>>>  }
> > >>>>
> > >>>>  @Test
> > >>>>  public void testSimple() throws IOException {
> > >>>>      pigServer.registerQuery("a = load '" + inputTxt.toExternalForm()
> > +
> > >>>> "' using PigStorage(',') " +
> > >>>>              "as (id:int, a:chararray, b:chararray, c:chararray,
> > >>>> d:chararray);");
> > >>>>      pigServer.registerQuery("last = foreach a generate flatten(" +
> > >>>> ToBag.class.getName() + "(2, id, a, id, b, id, c));");
> > >>>>
> > >>>>      pigServer.deleteFile("target/pigtest/func1.txt");
> > >>>>      pigServer.store("last", "target/pigtest/func1.txt");
> > >>>>      assertTrue(pigServer.fileSize("target/pigtest/func1.txt") > 0);
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>>
> > >>
> > >
> >

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.


[jira] Commented: (PIG-1385) UDF to create tuples and bags

Posted by "Daniel Lescohier (JIRA)" <ji...@apache.org>.
    [ https://issues.apache.org/jira/browse/PIG-1385?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12867244#action_12867244 ] 

Daniel Lescohier commented on PIG-1385:
---------------------------------------

The Test file in PIG-1385-trunk.patch has a typo: 'org.paache' instead of 'org.apache'.

+++ contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/util/TestToBagToTuple.java	(revision 0)
@@ -0,0 +1,51 @@
+package org.paache.pig.piggybank.util;


> UDF to create tuples and bags
> -----------------------------
>
>                 Key: PIG-1385
>                 URL: https://issues.apache.org/jira/browse/PIG-1385
>             Project: Pig
>          Issue Type: New Feature
>          Components: tools
>    Affects Versions: 0.6.0
>            Reporter: hc busy
>            Assignee: hc busy
>             Fix For: 0.8.0
>
>         Attachments: PIG-1385-trunk.patch
>
>   Original Estimate: 24h
>  Remaining Estimate: 24h
>
> Based on this conversation:
> > On Tue, Apr 20, 2010 at 6:34 PM, hc busy <hc...@gmail.com> wrote:
> >
> > > What about making them part of the language using symbols?
> > >
> > > instead of
> > >
> > > foreach T generate Tuple($0, $1, $2), Bag($3, $4, $5), $6, $7;
> > >
> > > have language support
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, $6, $7;
> > >
> > > or even:
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, [$6#$7, $8#$9], $10, $11;
> > >
> > >
> > > Is there reason not to do the second or third other than being more
> > > complicated?
> > >
> > > Certainly I'd volunteer to put the top implementation in to the util
> > > package and submit them for builtin's, but the latter syntactic candies
> > > seems more natural..
> > >
> > >
> > >
> > > On Tue, Apr 20, 2010 at 5:24 PM, Alan Gates <ga...@yahoo-inc.com> wrote:
> > >
> > > >> The grouping package in piggybank is left over from back when Pig allowed
> > > >> users to define grouping functions (0.1).  Functions like these should go in
> > > >> evaluation.util.
> > > >>
> > > >> However, I'd consider putting these in builtin (in main Pig) instead.
> > > >>  These are things everyone asks for and they seem like a reasonable addition
> > > >> to the core engine.  This will be more of a burden to write (as we'll hold
> > > >> them to a higher standard) but of more use to people as well.
> > >>
> > >> Alan.
> > >>
> > >>
> > >> On Apr 19, 2010, at 12:53 PM, hc busy wrote:
> > >>
> > >>  Some times I wonder... I mean, somebody went to the trouble of making a
> > >>> path
> > >>> called
> > >>>
> > >>> org.apache.pig.piggybank.grouping
> > >>>
> > > >>> (where it seems like this code belong), but didn't check in any java code
> > > >>> into that package.
> > >>>
> > >>>
> > >>> Any comment about where to put this kind of utility classes?
> > >>>
> > >>>
> > >>>
> > >>> On Mon, Apr 19, 2010 at 12:07 PM, Andrey S <oc...@gmail.com> wrote:
> > >>>
> > >>>  2010/4/19 hc busy <hc...@gmail.com>
> > >>>>
> > >>>>  That's just the way it is right now, you can't make bags or tuples
> > >>>>> directly... Maybe we should have some UDF's in piggybank for these:
> > >>>>>
> > >>>>> toBag()
> > >>>>> toTuple(); --which is kinda like exec(Tuple in){return in;}
> > >>>>> TupleToBag(); --some times you need it this way for some reason.
> > >>>>>
> > >>>>>
> > >>>>>  Ok. I place my current code here, may be later I make a patch (if
> > such
> > >>>> implementation is acceptable of course).
> > >>>>
> > >>>> import org.apache.pig.EvalFunc;
> > >>>> import org.apache.pig.data.BagFactory;
> > >>>> import org.apache.pig.data.DataBag;
> > >>>> import org.apache.pig.data.Tuple;
> > >>>> import org.apache.pig.data.TupleFactory;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>>
> > >>>> /**
> > >>>> * Convert any sequence of fields to bag with specified count of
> > >>>> fields<br>
> > >>>> * Schema: count:int, fld1 [, fld2, fld3, fld4... ].
> > >>>> * Output: count=2, then { (fld1, fld2) , (fld3, fld4) ... }
> > >>>> *
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBag extends EvalFunc<DataBag> {
> > >>>>  public BagFactory bagFactory;
> > >>>>  public TupleFactory tupleFactory;
> > >>>>
> > >>>>  public ToBag() {
> > >>>>      bagFactory = BagFactory.getInstance();
> > >>>>      tupleFactory = TupleFactory.getInstance();
> > >>>>  }
> > >>>>
> > >>>>  @Override
> > >>>>  public DataBag exec(Tuple input) throws IOException {
> > >>>>      if (input.isNull())
> > >>>>          return null;
> > >>>>      final DataBag bag = bagFactory.newDefaultBag();
> > >>>>      final Integer couter = (Integer) input.get(0);
> > >>>>      if (couter == null)
> > >>>>          return null;
> > >>>>      Tuple tuple = tupleFactory.newTuple();
> > >>>>      for (int i = 0; i < input.size() - 1; i++) {
> > >>>>          if (i % couter == 0) {
> > >>>>              tuple = tupleFactory.newTuple();
> > >>>>              bag.add(tuple);
> > >>>>          }
> > >>>>          tuple.append(input.get(i + 1));
> > >>>>      }
> > >>>>      return bag;
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>> import org.apache.pig.ExecType;
> > >>>> import org.apache.pig.PigServer;
> > >>>> import org.junit.Before;
> > >>>> import org.junit.Test;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>> import java.net.URISyntaxException;
> > >>>> import java.net.URL;
> > >>>>
> > >>>> import static org.junit.Assert.assertTrue;
> > >>>>
> > >>>> /**
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBagTest {
> > >>>>  PigServer pigServer;
> > >>>>  URL inputTxt;
> > >>>>
> > >>>>  @Before
> > >>>>  public void init() throws IOException, URISyntaxException {
> > >>>>      pigServer = new PigServer(ExecType.LOCAL);
> > >>>>      inputTxt =
> > >>>> this.getClass().getResource("bagTest.txt").toURI().toURL();
> > >>>>  }
> > >>>>
> > >>>>  @Test
> > >>>>  public void testSimple() throws IOException {
> > >>>>      pigServer.registerQuery("a = load '" + inputTxt.toExternalForm()
> > +
> > >>>> "' using PigStorage(',') " +
> > >>>>              "as (id:int, a:chararray, b:chararray, c:chararray,
> > >>>> d:chararray);");
> > >>>>      pigServer.registerQuery("last = foreach a generate flatten(" +
> > >>>> ToBag.class.getName() + "(2, id, a, id, b, id, c));");
> > >>>>
> > >>>>      pigServer.deleteFile("target/pigtest/func1.txt");
> > >>>>      pigServer.store("last", "target/pigtest/func1.txt");
> > >>>>      assertTrue(pigServer.fileSize("target/pigtest/func1.txt") > 0);
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>>
> > >>
> > >
> >

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.


[jira] Updated: (PIG-1385) UDF to create tuples and bags

Posted by "hc busy (JIRA)" <ji...@apache.org>.
     [ https://issues.apache.org/jira/browse/PIG-1385?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

hc busy updated PIG-1385:
-------------------------

    Attachment: PIG-1385-trunk.patch

> UDF to create tuples and bags
> -----------------------------
>
>                 Key: PIG-1385
>                 URL: https://issues.apache.org/jira/browse/PIG-1385
>             Project: Pig
>          Issue Type: New Feature
>          Components: tools
>    Affects Versions: 0.6.0
>            Reporter: hc busy
>         Attachments: PIG-1385-trunk.patch
>
>   Original Estimate: 24h
>  Remaining Estimate: 24h
>
> Based on this conversation:
> > On Tue, Apr 20, 2010 at 6:34 PM, hc busy <hc...@gmail.com> wrote:
> >
> > > What about making them part of the language using symbols?
> > >
> > > instead of
> > >
> > > foreach T generate Tuple($0, $1, $2), Bag($3, $4, $5), $6, $7;
> > >
> > > have language support
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, $6, $7;
> > >
> > > or even:
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, [$6#$7, $8#$9], $10, $11;
> > >
> > >
> > > Is there reason not to do the second or third other than being more
> > > complicated?
> > >
> > > Certainly I'd volunteer to put the top implementation in to the util
> > > package and submit them for builtin's, but the latter syntactic candies
> > > seems more natural..
> > >
> > >
> > >
> > > On Tue, Apr 20, 2010 at 5:24 PM, Alan Gates <ga...@yahoo-inc.com> wrote:
> > >
> > > >> The grouping package in piggybank is left over from back when Pig allowed
> > > >> users to define grouping functions (0.1).  Functions like these should go in
> > > >> evaluation.util.
> > > >>
> > > >> However, I'd consider putting these in builtin (in main Pig) instead.
> > > >>  These are things everyone asks for and they seem like a reasonable addition
> > > >> to the core engine.  This will be more of a burden to write (as we'll hold
> > > >> them to a higher standard) but of more use to people as well.
> > >>
> > >> Alan.
> > >>
> > >>
> > >> On Apr 19, 2010, at 12:53 PM, hc busy wrote:
> > >>
> > >>  Some times I wonder... I mean, somebody went to the trouble of making a
> > >>> path
> > >>> called
> > >>>
> > >>> org.apache.pig.piggybank.grouping
> > >>>
> > > >>> (where it seems like this code belong), but didn't check in any java code
> > > >>> into that package.
> > >>>
> > >>>
> > >>> Any comment about where to put this kind of utility classes?
> > >>>
> > >>>
> > >>>
> > >>> On Mon, Apr 19, 2010 at 12:07 PM, Andrey S <oc...@gmail.com> wrote:
> > >>>
> > >>>  2010/4/19 hc busy <hc...@gmail.com>
> > >>>>
> > >>>>  That's just the way it is right now, you can't make bags or tuples
> > >>>>> directly... Maybe we should have some UDF's in piggybank for these:
> > >>>>>
> > >>>>> toBag()
> > >>>>> toTuple(); --which is kinda like exec(Tuple in){return in;}
> > >>>>> TupleToBag(); --some times you need it this way for some reason.
> > >>>>>
> > >>>>>
> > >>>>>  Ok. I place my current code here, may be later I make a patch (if
> > such
> > >>>> implementation is acceptable of course).
> > >>>>
> > >>>> import org.apache.pig.EvalFunc;
> > >>>> import org.apache.pig.data.BagFactory;
> > >>>> import org.apache.pig.data.DataBag;
> > >>>> import org.apache.pig.data.Tuple;
> > >>>> import org.apache.pig.data.TupleFactory;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>>
> > >>>> /**
> > >>>> * Convert any sequence of fields to bag with specified count of
> > >>>> fields<br>
> > >>>> * Schema: count:int, fld1 [, fld2, fld3, fld4... ].
> > >>>> * Output: count=2, then { (fld1, fld2) , (fld3, fld4) ... }
> > >>>> *
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBag extends EvalFunc<DataBag> {
> > >>>>  public BagFactory bagFactory;
> > >>>>  public TupleFactory tupleFactory;
> > >>>>
> > >>>>  public ToBag() {
> > >>>>      bagFactory = BagFactory.getInstance();
> > >>>>      tupleFactory = TupleFactory.getInstance();
> > >>>>  }
> > >>>>
> > >>>>  @Override
> > >>>>  public DataBag exec(Tuple input) throws IOException {
> > >>>>      if (input.isNull())
> > >>>>          return null;
> > >>>>      final DataBag bag = bagFactory.newDefaultBag();
> > >>>>      final Integer couter = (Integer) input.get(0);
> > >>>>      if (couter == null)
> > >>>>          return null;
> > >>>>      Tuple tuple = tupleFactory.newTuple();
> > >>>>      for (int i = 0; i < input.size() - 1; i++) {
> > >>>>          if (i % couter == 0) {
> > >>>>              tuple = tupleFactory.newTuple();
> > >>>>              bag.add(tuple);
> > >>>>          }
> > >>>>          tuple.append(input.get(i + 1));
> > >>>>      }
> > >>>>      return bag;
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>> import org.apache.pig.ExecType;
> > >>>> import org.apache.pig.PigServer;
> > >>>> import org.junit.Before;
> > >>>> import org.junit.Test;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>> import java.net.URISyntaxException;
> > >>>> import java.net.URL;
> > >>>>
> > >>>> import static org.junit.Assert.assertTrue;
> > >>>>
> > >>>> /**
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBagTest {
> > >>>>  PigServer pigServer;
> > >>>>  URL inputTxt;
> > >>>>
> > >>>>  @Before
> > >>>>  public void init() throws IOException, URISyntaxException {
> > >>>>      pigServer = new PigServer(ExecType.LOCAL);
> > >>>>      inputTxt =
> > >>>> this.getClass().getResource("bagTest.txt").toURI().toURL();
> > >>>>  }
> > >>>>
> > >>>>  @Test
> > >>>>  public void testSimple() throws IOException {
> > >>>>      pigServer.registerQuery("a = load '" + inputTxt.toExternalForm()
> > +
> > >>>> "' using PigStorage(',') " +
> > >>>>              "as (id:int, a:chararray, b:chararray, c:chararray,
> > >>>> d:chararray);");
> > >>>>      pigServer.registerQuery("last = foreach a generate flatten(" +
> > >>>> ToBag.class.getName() + "(2, id, a, id, b, id, c));");
> > >>>>
> > >>>>      pigServer.deleteFile("target/pigtest/func1.txt");
> > >>>>      pigServer.store("last", "target/pigtest/func1.txt");
> > >>>>      assertTrue(pigServer.fileSize("target/pigtest/func1.txt") > 0);
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>>
> > >>
> > >
> >

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.


[jira] Updated: (PIG-1385) UDF to create tuples and bags

Posted by "hc busy (JIRA)" <ji...@apache.org>.
     [ https://issues.apache.org/jira/browse/PIG-1385?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

hc busy updated PIG-1385:
-------------------------

    Status: Patch Available  (was: Open)

resubmitting patch for the build system to check.

> UDF to create tuples and bags
> -----------------------------
>
>                 Key: PIG-1385
>                 URL: https://issues.apache.org/jira/browse/PIG-1385
>             Project: Pig
>          Issue Type: New Feature
>          Components: tools
>    Affects Versions: 0.6.0
>            Reporter: hc busy
>            Assignee: hc busy
>         Attachments: PIG-1385-trunk.patch
>
>   Original Estimate: 24h
>  Remaining Estimate: 24h
>
> Based on this conversation:
> > On Tue, Apr 20, 2010 at 6:34 PM, hc busy <hc...@gmail.com> wrote:
> >
> > > What about making them part of the language using symbols?
> > >
> > > instead of
> > >
> > > foreach T generate Tuple($0, $1, $2), Bag($3, $4, $5), $6, $7;
> > >
> > > have language support
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, $6, $7;
> > >
> > > or even:
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, [$6#$7, $8#$9], $10, $11;
> > >
> > >
> > > Is there reason not to do the second or third other than being more
> > > complicated?
> > >
> > > Certainly I'd volunteer to put the top implementation in to the util
> > > package and submit them for builtin's, but the latter syntactic candies
> > > seems more natural..
> > >
> > >
> > >
> > > On Tue, Apr 20, 2010 at 5:24 PM, Alan Gates <ga...@yahoo-inc.com> wrote:
> > >
> > > >> The grouping package in piggybank is left over from back when Pig allowed
> > > >> users to define grouping functions (0.1).  Functions like these should go in
> > > >> evaluation.util.
> > > >>
> > > >> However, I'd consider putting these in builtin (in main Pig) instead.
> > > >>  These are things everyone asks for and they seem like a reasonable addition
> > > >> to the core engine.  This will be more of a burden to write (as we'll hold
> > > >> them to a higher standard) but of more use to people as well.
> > >>
> > >> Alan.
> > >>
> > >>
> > >> On Apr 19, 2010, at 12:53 PM, hc busy wrote:
> > >>
> > >>  Some times I wonder... I mean, somebody went to the trouble of making a
> > >>> path
> > >>> called
> > >>>
> > >>> org.apache.pig.piggybank.grouping
> > >>>
> > > >>> (where it seems like this code belong), but didn't check in any java code
> > > >>> into that package.
> > >>>
> > >>>
> > >>> Any comment about where to put this kind of utility classes?
> > >>>
> > >>>
> > >>>
> > >>> On Mon, Apr 19, 2010 at 12:07 PM, Andrey S <oc...@gmail.com> wrote:
> > >>>
> > >>>  2010/4/19 hc busy <hc...@gmail.com>
> > >>>>
> > >>>>  That's just the way it is right now, you can't make bags or tuples
> > >>>>> directly... Maybe we should have some UDF's in piggybank for these:
> > >>>>>
> > >>>>> toBag()
> > >>>>> toTuple(); --which is kinda like exec(Tuple in){return in;}
> > >>>>> TupleToBag(); --some times you need it this way for some reason.
> > >>>>>
> > >>>>>
> > >>>>>  Ok. I place my current code here, may be later I make a patch (if
> > such
> > >>>> implementation is acceptable of course).
> > >>>>
> > >>>> import org.apache.pig.EvalFunc;
> > >>>> import org.apache.pig.data.BagFactory;
> > >>>> import org.apache.pig.data.DataBag;
> > >>>> import org.apache.pig.data.Tuple;
> > >>>> import org.apache.pig.data.TupleFactory;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>>
> > >>>> /**
> > >>>> * Convert any sequence of fields to bag with specified count of
> > >>>> fields<br>
> > >>>> * Schema: count:int, fld1 [, fld2, fld3, fld4... ].
> > >>>> * Output: count=2, then { (fld1, fld2) , (fld3, fld4) ... }
> > >>>> *
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBag extends EvalFunc<DataBag> {
> > >>>>  public BagFactory bagFactory;
> > >>>>  public TupleFactory tupleFactory;
> > >>>>
> > >>>>  public ToBag() {
> > >>>>      bagFactory = BagFactory.getInstance();
> > >>>>      tupleFactory = TupleFactory.getInstance();
> > >>>>  }
> > >>>>
> > >>>>  @Override
> > >>>>  public DataBag exec(Tuple input) throws IOException {
> > >>>>      if (input.isNull())
> > >>>>          return null;
> > >>>>      final DataBag bag = bagFactory.newDefaultBag();
> > >>>>      final Integer couter = (Integer) input.get(0);
> > >>>>      if (couter == null)
> > >>>>          return null;
> > >>>>      Tuple tuple = tupleFactory.newTuple();
> > >>>>      for (int i = 0; i < input.size() - 1; i++) {
> > >>>>          if (i % couter == 0) {
> > >>>>              tuple = tupleFactory.newTuple();
> > >>>>              bag.add(tuple);
> > >>>>          }
> > >>>>          tuple.append(input.get(i + 1));
> > >>>>      }
> > >>>>      return bag;
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>> import org.apache.pig.ExecType;
> > >>>> import org.apache.pig.PigServer;
> > >>>> import org.junit.Before;
> > >>>> import org.junit.Test;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>> import java.net.URISyntaxException;
> > >>>> import java.net.URL;
> > >>>>
> > >>>> import static org.junit.Assert.assertTrue;
> > >>>>
> > >>>> /**
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBagTest {
> > >>>>  PigServer pigServer;
> > >>>>  URL inputTxt;
> > >>>>
> > >>>>  @Before
> > >>>>  public void init() throws IOException, URISyntaxException {
> > >>>>      pigServer = new PigServer(ExecType.LOCAL);
> > >>>>      inputTxt =
> > >>>> this.getClass().getResource("bagTest.txt").toURI().toURL();
> > >>>>  }
> > >>>>
> > >>>>  @Test
> > >>>>  public void testSimple() throws IOException {
> > >>>>      pigServer.registerQuery("a = load '" + inputTxt.toExternalForm()
> > +
> > >>>> "' using PigStorage(',') " +
> > >>>>              "as (id:int, a:chararray, b:chararray, c:chararray,
> > >>>> d:chararray);");
> > >>>>      pigServer.registerQuery("last = foreach a generate flatten(" +
> > >>>> ToBag.class.getName() + "(2, id, a, id, b, id, c));");
> > >>>>
> > >>>>      pigServer.deleteFile("target/pigtest/func1.txt");
> > >>>>      pigServer.store("last", "target/pigtest/func1.txt");
> > >>>>      assertTrue(pigServer.fileSize("target/pigtest/func1.txt") > 0);
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>>
> > >>
> > >
> >

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.


[jira] Updated: (PIG-1385) UDF to create tuples and bags

Posted by "hc busy (JIRA)" <ji...@apache.org>.
     [ https://issues.apache.org/jira/browse/PIG-1385?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

hc busy updated PIG-1385:
-------------------------

    Attachment:     (was: PIG-1385-trunk.patch)

> UDF to create tuples and bags
> -----------------------------
>
>                 Key: PIG-1385
>                 URL: https://issues.apache.org/jira/browse/PIG-1385
>             Project: Pig
>          Issue Type: New Feature
>          Components: tools
>    Affects Versions: 0.6.0
>            Reporter: hc busy
>            Assignee: hc busy
>   Original Estimate: 24h
>  Remaining Estimate: 24h
>
> Based on this conversation:
> > On Tue, Apr 20, 2010 at 6:34 PM, hc busy <hc...@gmail.com> wrote:
> >
> > > What about making them part of the language using symbols?
> > >
> > > instead of
> > >
> > > foreach T generate Tuple($0, $1, $2), Bag($3, $4, $5), $6, $7;
> > >
> > > have language support
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, $6, $7;
> > >
> > > or even:
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, [$6#$7, $8#$9], $10, $11;
> > >
> > >
> > > Is there reason not to do the second or third other than being more
> > > complicated?
> > >
> > > Certainly I'd volunteer to put the top implementation in to the util
> > > package and submit them for builtin's, but the latter syntactic candies
> > > seems more natural..
> > >
> > >
> > >
> > > On Tue, Apr 20, 2010 at 5:24 PM, Alan Gates <ga...@yahoo-inc.com> wrote:
> > >
> > > >> The grouping package in piggybank is left over from back when Pig allowed
> > > >> users to define grouping functions (0.1).  Functions like these should go in
> > > >> evaluation.util.
> > > >>
> > > >> However, I'd consider putting these in builtin (in main Pig) instead.
> > > >>  These are things everyone asks for and they seem like a reasonable addition
> > > >> to the core engine.  This will be more of a burden to write (as we'll hold
> > > >> them to a higher standard) but of more use to people as well.
> > >>
> > >> Alan.
> > >>
> > >>
> > >> On Apr 19, 2010, at 12:53 PM, hc busy wrote:
> > >>
> > >>  Some times I wonder... I mean, somebody went to the trouble of making a
> > >>> path
> > >>> called
> > >>>
> > >>> org.apache.pig.piggybank.grouping
> > >>>
> > > >>> (where it seems like this code belong), but didn't check in any java code
> > > >>> into that package.
> > >>>
> > >>>
> > >>> Any comment about where to put this kind of utility classes?
> > >>>
> > >>>
> > >>>
> > >>> On Mon, Apr 19, 2010 at 12:07 PM, Andrey S <oc...@gmail.com> wrote:
> > >>>
> > >>>  2010/4/19 hc busy <hc...@gmail.com>
> > >>>>
> > >>>>  That's just the way it is right now, you can't make bags or tuples
> > >>>>> directly... Maybe we should have some UDF's in piggybank for these:
> > >>>>>
> > >>>>> toBag()
> > >>>>> toTuple(); --which is kinda like exec(Tuple in){return in;}
> > >>>>> TupleToBag(); --some times you need it this way for some reason.
> > >>>>>
> > >>>>>
> > >>>>>  Ok. I place my current code here, may be later I make a patch (if
> > such
> > >>>> implementation is acceptable of course).
> > >>>>
> > >>>> import org.apache.pig.EvalFunc;
> > >>>> import org.apache.pig.data.BagFactory;
> > >>>> import org.apache.pig.data.DataBag;
> > >>>> import org.apache.pig.data.Tuple;
> > >>>> import org.apache.pig.data.TupleFactory;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>>
> > >>>> /**
> > >>>> * Convert any sequence of fields to bag with specified count of
> > >>>> fields<br>
> > >>>> * Schema: count:int, fld1 [, fld2, fld3, fld4... ].
> > >>>> * Output: count=2, then { (fld1, fld2) , (fld3, fld4) ... }
> > >>>> *
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBag extends EvalFunc<DataBag> {
> > >>>>  public BagFactory bagFactory;
> > >>>>  public TupleFactory tupleFactory;
> > >>>>
> > >>>>  public ToBag() {
> > >>>>      bagFactory = BagFactory.getInstance();
> > >>>>      tupleFactory = TupleFactory.getInstance();
> > >>>>  }
> > >>>>
> > >>>>  @Override
> > >>>>  public DataBag exec(Tuple input) throws IOException {
> > >>>>      if (input.isNull())
> > >>>>          return null;
> > >>>>      final DataBag bag = bagFactory.newDefaultBag();
> > >>>>      final Integer couter = (Integer) input.get(0);
> > >>>>      if (couter == null)
> > >>>>          return null;
> > >>>>      Tuple tuple = tupleFactory.newTuple();
> > >>>>      for (int i = 0; i < input.size() - 1; i++) {
> > >>>>          if (i % couter == 0) {
> > >>>>              tuple = tupleFactory.newTuple();
> > >>>>              bag.add(tuple);
> > >>>>          }
> > >>>>          tuple.append(input.get(i + 1));
> > >>>>      }
> > >>>>      return bag;
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>> import org.apache.pig.ExecType;
> > >>>> import org.apache.pig.PigServer;
> > >>>> import org.junit.Before;
> > >>>> import org.junit.Test;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>> import java.net.URISyntaxException;
> > >>>> import java.net.URL;
> > >>>>
> > >>>> import static org.junit.Assert.assertTrue;
> > >>>>
> > >>>> /**
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBagTest {
> > >>>>  PigServer pigServer;
> > >>>>  URL inputTxt;
> > >>>>
> > >>>>  @Before
> > >>>>  public void init() throws IOException, URISyntaxException {
> > >>>>      pigServer = new PigServer(ExecType.LOCAL);
> > >>>>      inputTxt =
> > >>>> this.getClass().getResource("bagTest.txt").toURI().toURL();
> > >>>>  }
> > >>>>
> > >>>>  @Test
> > >>>>  public void testSimple() throws IOException {
> > >>>>      pigServer.registerQuery("a = load '" + inputTxt.toExternalForm()
> > +
> > >>>> "' using PigStorage(',') " +
> > >>>>              "as (id:int, a:chararray, b:chararray, c:chararray,
> > >>>> d:chararray);");
> > >>>>      pigServer.registerQuery("last = foreach a generate flatten(" +
> > >>>> ToBag.class.getName() + "(2, id, a, id, b, id, c));");
> > >>>>
> > >>>>      pigServer.deleteFile("target/pigtest/func1.txt");
> > >>>>      pigServer.store("last", "target/pigtest/func1.txt");
> > >>>>      assertTrue(pigServer.fileSize("target/pigtest/func1.txt") > 0);
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>>
> > >>
> > >
> >

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.


[jira] Commented: (PIG-1385) UDF to create tuples and bags

Posted by "Alan Gates (JIRA)" <ji...@apache.org>.
    [ https://issues.apache.org/jira/browse/PIG-1385?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12859568#action_12859568 ] 

Alan Gates commented on PIG-1385:
---------------------------------

Functions look good.  Javadoc on how the functions should be used (what input they expect and what output they'll produce) should be added so that users can see how to use the functions.

> UDF to create tuples and bags
> -----------------------------
>
>                 Key: PIG-1385
>                 URL: https://issues.apache.org/jira/browse/PIG-1385
>             Project: Pig
>          Issue Type: New Feature
>          Components: tools
>    Affects Versions: 0.6.0
>            Reporter: hc busy
>         Attachments: PIG-1385-trunk.patch
>
>   Original Estimate: 24h
>  Remaining Estimate: 24h
>
> Based on this conversation:
> > On Tue, Apr 20, 2010 at 6:34 PM, hc busy <hc...@gmail.com> wrote:
> >
> > > What about making them part of the language using symbols?
> > >
> > > instead of
> > >
> > > foreach T generate Tuple($0, $1, $2), Bag($3, $4, $5), $6, $7;
> > >
> > > have language support
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, $6, $7;
> > >
> > > or even:
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, [$6#$7, $8#$9], $10, $11;
> > >
> > >
> > > Is there reason not to do the second or third other than being more
> > > complicated?
> > >
> > > Certainly I'd volunteer to put the top implementation in to the util
> > > package and submit them for builtin's, but the latter syntactic candies
> > > seems more natural..
> > >
> > >
> > >
> > > On Tue, Apr 20, 2010 at 5:24 PM, Alan Gates <ga...@yahoo-inc.com> wrote:
> > >
> > > >> The grouping package in piggybank is left over from back when Pig allowed
> > > >> users to define grouping functions (0.1).  Functions like these should go in
> > > >> evaluation.util.
> > > >>
> > > >> However, I'd consider putting these in builtin (in main Pig) instead.
> > > >>  These are things everyone asks for and they seem like a reasonable addition
> > > >> to the core engine.  This will be more of a burden to write (as we'll hold
> > > >> them to a higher standard) but of more use to people as well.
> > >>
> > >> Alan.
> > >>
> > >>
> > >> On Apr 19, 2010, at 12:53 PM, hc busy wrote:
> > >>
> > >>  Some times I wonder... I mean, somebody went to the trouble of making a
> > >>> path
> > >>> called
> > >>>
> > >>> org.apache.pig.piggybank.grouping
> > >>>
> > > >>> (where it seems like this code belong), but didn't check in any java code
> > > >>> into that package.
> > >>>
> > >>>
> > >>> Any comment about where to put this kind of utility classes?
> > >>>
> > >>>
> > >>>
> > >>> On Mon, Apr 19, 2010 at 12:07 PM, Andrey S <oc...@gmail.com> wrote:
> > >>>
> > >>>  2010/4/19 hc busy <hc...@gmail.com>
> > >>>>
> > >>>>  That's just the way it is right now, you can't make bags or tuples
> > >>>>> directly... Maybe we should have some UDF's in piggybank for these:
> > >>>>>
> > >>>>> toBag()
> > >>>>> toTuple(); --which is kinda like exec(Tuple in){return in;}
> > >>>>> TupleToBag(); --some times you need it this way for some reason.
> > >>>>>
> > >>>>>
> > >>>>>  Ok. I place my current code here, may be later I make a patch (if
> > such
> > >>>> implementation is acceptable of course).
> > >>>>
> > >>>> import org.apache.pig.EvalFunc;
> > >>>> import org.apache.pig.data.BagFactory;
> > >>>> import org.apache.pig.data.DataBag;
> > >>>> import org.apache.pig.data.Tuple;
> > >>>> import org.apache.pig.data.TupleFactory;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>>
> > >>>> /**
> > >>>> * Convert any sequence of fields to bag with specified count of
> > >>>> fields<br>
> > >>>> * Schema: count:int, fld1 [, fld2, fld3, fld4... ].
> > >>>> * Output: count=2, then { (fld1, fld2) , (fld3, fld4) ... }
> > >>>> *
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBag extends EvalFunc<DataBag> {
> > >>>>  public BagFactory bagFactory;
> > >>>>  public TupleFactory tupleFactory;
> > >>>>
> > >>>>  public ToBag() {
> > >>>>      bagFactory = BagFactory.getInstance();
> > >>>>      tupleFactory = TupleFactory.getInstance();
> > >>>>  }
> > >>>>
> > >>>>  @Override
> > >>>>  public DataBag exec(Tuple input) throws IOException {
> > >>>>      if (input.isNull())
> > >>>>          return null;
> > >>>>      final DataBag bag = bagFactory.newDefaultBag();
> > >>>>      final Integer couter = (Integer) input.get(0);
> > >>>>      if (couter == null)
> > >>>>          return null;
> > >>>>      Tuple tuple = tupleFactory.newTuple();
> > >>>>      for (int i = 0; i < input.size() - 1; i++) {
> > >>>>          if (i % couter == 0) {
> > >>>>              tuple = tupleFactory.newTuple();
> > >>>>              bag.add(tuple);
> > >>>>          }
> > >>>>          tuple.append(input.get(i + 1));
> > >>>>      }
> > >>>>      return bag;
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>> import org.apache.pig.ExecType;
> > >>>> import org.apache.pig.PigServer;
> > >>>> import org.junit.Before;
> > >>>> import org.junit.Test;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>> import java.net.URISyntaxException;
> > >>>> import java.net.URL;
> > >>>>
> > >>>> import static org.junit.Assert.assertTrue;
> > >>>>
> > >>>> /**
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBagTest {
> > >>>>  PigServer pigServer;
> > >>>>  URL inputTxt;
> > >>>>
> > >>>>  @Before
> > >>>>  public void init() throws IOException, URISyntaxException {
> > >>>>      pigServer = new PigServer(ExecType.LOCAL);
> > >>>>      inputTxt =
> > >>>> this.getClass().getResource("bagTest.txt").toURI().toURL();
> > >>>>  }
> > >>>>
> > >>>>  @Test
> > >>>>  public void testSimple() throws IOException {
> > >>>>      pigServer.registerQuery("a = load '" + inputTxt.toExternalForm()
> > +
> > >>>> "' using PigStorage(',') " +
> > >>>>              "as (id:int, a:chararray, b:chararray, c:chararray,
> > >>>> d:chararray);");
> > >>>>      pigServer.registerQuery("last = foreach a generate flatten(" +
> > >>>> ToBag.class.getName() + "(2, id, a, id, b, id, c));");
> > >>>>
> > >>>>      pigServer.deleteFile("target/pigtest/func1.txt");
> > >>>>      pigServer.store("last", "target/pigtest/func1.txt");
> > >>>>      assertTrue(pigServer.fileSize("target/pigtest/func1.txt") > 0);
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>>
> > >>
> > >
> >

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.


[jira] Commented: (PIG-1385) UDF to create tuples and bags

Posted by "hc busy (JIRA)" <ji...@apache.org>.
    [ https://issues.apache.org/jira/browse/PIG-1385?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12860488#action_12860488 ] 

hc busy commented on PIG-1385:
------------------------------

yeah! my first contrib. Thanks, Alan!!

> UDF to create tuples and bags
> -----------------------------
>
>                 Key: PIG-1385
>                 URL: https://issues.apache.org/jira/browse/PIG-1385
>             Project: Pig
>          Issue Type: New Feature
>          Components: tools
>    Affects Versions: 0.6.0
>            Reporter: hc busy
>            Assignee: hc busy
>             Fix For: 0.8.0
>
>         Attachments: PIG-1385-trunk.patch
>
>   Original Estimate: 24h
>  Remaining Estimate: 24h
>
> Based on this conversation:
> > On Tue, Apr 20, 2010 at 6:34 PM, hc busy <hc...@gmail.com> wrote:
> >
> > > What about making them part of the language using symbols?
> > >
> > > instead of
> > >
> > > foreach T generate Tuple($0, $1, $2), Bag($3, $4, $5), $6, $7;
> > >
> > > have language support
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, $6, $7;
> > >
> > > or even:
> > >
> > > foreach T generate ($0, $1, $2), {$3, $4, $5}, [$6#$7, $8#$9], $10, $11;
> > >
> > >
> > > > Is there a reason not to do the second or third other than being more
> > > complicated?
> > >
> > > > Certainly I'd volunteer to put the top implementation into the util
> > > > package and submit them for builtins, but the latter syntactic sugar
> > > > seems more natural.
> > >
> > >
> > >
> > > On Tue, Apr 20, 2010 at 5:24 PM, Alan Gates <ga...@yahoo-inc.com> wrote:
> > >
> > >> The grouping package in piggybank is left over from back when Pig
> > allowed
> > >> users to define grouping functions (0.1).  Functions like these should
> > go in
> > >> evaluation.util.
> > >>
> > >> However, I'd consider putting these in builtin (in main Pig) instead.
> > >>  These are things everyone asks for and they seem like a reasonable
> > addition
> > >> to the core engine.  This will be more of a burden to write (as we'll
> > hold
> > >> them to a higher standard) but of more use to people as well.
> > >>
> > >> Alan.
> > >>
> > >>
> > >> On Apr 19, 2010, at 12:53 PM, hc busy wrote:
> > >>
> > >>  Sometimes I wonder... I mean, somebody went to the trouble of making a
> > >>> path
> > >>> called
> > >>>
> > >>> org.apache.pig.piggybank.grouping
> > >>>
> > >>> (where it seems like this code belongs), but didn't check in any Java
> > code
> > >>> into that package.
> > >>>
> > >>>
> > >>> Any comment about where to put this kind of utility class?
> > >>>
> > >>>
> > >>>
> > >>> On Mon, Apr 19, 2010 at 12:07 PM, Andrey S <oc...@gmail.com> wrote:
> > >>>
> > >>>  2010/4/19 hc busy <hc...@gmail.com>
> > >>>>
> > >>>>  That's just the way it is right now, you can't make bags or tuples
> > >>>>> directly... Maybe we should have some UDF's in piggybank for these:
> > >>>>>
> > >>>>> toBag()
> > >>>>> toTuple(); --which is kinda like exec(Tuple in){return in;}
> > >>>>> TupleToBag(); --some times you need it this way for some reason.
> > >>>>>
> > >>>>>
> > >>>>>  OK, I'll post my current code here; maybe later I'll make a patch (if
> > such
> > >>>> an implementation is acceptable, of course).
> > >>>>
> > >>>> import org.apache.pig.EvalFunc;
> > >>>> import org.apache.pig.data.BagFactory;
> > >>>> import org.apache.pig.data.DataBag;
> > >>>> import org.apache.pig.data.Tuple;
> > >>>> import org.apache.pig.data.TupleFactory;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>>
> > >>>> /**
> > >>>> * Convert any sequence of fields to bag with specified count of
> > >>>> fields<br>
> > >>>> * Schema: count:int, fld1 [, fld2, fld3, fld4... ].
> > >>>> * Output: count=2, then { (fld1, fld2) , (fld3, fld4) ... }
> > >>>> *
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBag extends EvalFunc<DataBag> {
> > >>>>  public BagFactory bagFactory;
> > >>>>  public TupleFactory tupleFactory;
> > >>>>
> > >>>>  public ToBag() {
> > >>>>      bagFactory = BagFactory.getInstance();
> > >>>>      tupleFactory = TupleFactory.getInstance();
> > >>>>  }
> > >>>>
> > >>>>  @Override
> > >>>>  public DataBag exec(Tuple input) throws IOException {
> > >>>>      if (input.isNull())
> > >>>>          return null;
> > >>>>      final DataBag bag = bagFactory.newDefaultBag();
> > >>>>      final Integer couter = (Integer) input.get(0);
> > >>>>      if (couter == null)
> > >>>>          return null;
> > >>>>      Tuple tuple = tupleFactory.newTuple();
> > >>>>      for (int i = 0; i < input.size() - 1; i++) {
> > >>>>          if (i % couter == 0) {
> > >>>>              tuple = tupleFactory.newTuple();
> > >>>>              bag.add(tuple);
> > >>>>          }
> > >>>>          tuple.append(input.get(i + 1));
> > >>>>      }
> > >>>>      return bag;
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>> import org.apache.pig.ExecType;
> > >>>> import org.apache.pig.PigServer;
> > >>>> import org.junit.Before;
> > >>>> import org.junit.Test;
> > >>>>
> > >>>> import java.io.IOException;
> > >>>> import java.net.URISyntaxException;
> > >>>> import java.net.URL;
> > >>>>
> > >>>> import static org.junit.Assert.assertTrue;
> > >>>>
> > >>>> /**
> > >>>> * @author astepachev
> > >>>> */
> > >>>> public class ToBagTest {
> > >>>>  PigServer pigServer;
> > >>>>  URL inputTxt;
> > >>>>
> > >>>>  @Before
> > >>>>  public void init() throws IOException, URISyntaxException {
> > >>>>      pigServer = new PigServer(ExecType.LOCAL);
> > >>>>      inputTxt =
> > >>>> this.getClass().getResource("bagTest.txt").toURI().toURL();
> > >>>>  }
> > >>>>
> > >>>>  @Test
> > >>>>  public void testSimple() throws IOException {
> > >>>>      pigServer.registerQuery("a = load '" + inputTxt.toExternalForm()
> > +
> > >>>> "' using PigStorage(',') " +
> > >>>>              "as (id:int, a:chararray, b:chararray, c:chararray,
> > >>>> d:chararray);");
> > >>>>      pigServer.registerQuery("last = foreach a generate flatten(" +
> > >>>> ToBag.class.getName() + "(2, id, a, id, b, id, c));");
> > >>>>
> > >>>>      pigServer.deleteFile("target/pigtest/func1.txt");
> > >>>>      pigServer.store("last", "target/pigtest/func1.txt");
> > >>>>      assertTrue(pigServer.fileSize("target/pigtest/func1.txt") > 0);
> > >>>>  }
> > >>>> }
> > >>>>
> > >>>>
> > >>
> > >
> >

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.