You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@hive.apache.org by Matthew Bryan <go...@gmail.com> on 2010/04/02 22:11:21 UTC

UDAF on AWS Hive

I'm writing a basic group_concat UDAF for the Amazon version of
Hive....and it's working fine for unordered groupings. But I can't
seem to get an ordered version working (filling an array based on an
IntWritable passed alongside). When I move from using Text return type
on terminatePartial() to either Text[] or a State class I start
getting errors:

FAILED: Error in semantic analysis:
org.apache.hadoop.hive.ql.metadata.HiveException: Cannot recognize
return type class [Lorg.apache.hadoop.io.Text; from public
org.apache.hadoop.io.Text[]
com.company.hadoop.hive.udaf.UDAFGroupConcatN$GroupConcatNStringEvaluator.terminatePartial()

or

FAILED: Error in semantic analysis:
org.apache.hadoop.hive.ql.metadata.HiveException: Cannot recognize
return type class
com.company.hadoop.hive.udaf.UDAFGroupConcatN$UDAFGroupConc
atNState from public
com.company.hadoop.hive.udaf.UDAFGroupConcatN$UDAFGroupConcatNState
com.company.hadoop.hive.udaf.UDAFGroupConcatN$GroupConcatNStringEvaluator.terminatePartial
()

What limits are there on the return type of
terminatePartial()....shouldn't it just have to match the argument of
merge and nothing more? Keep in mind this is the Amazon version of
Hive (0.4 I think)....

I put both versions of the UDAF below, ordered and unordered.

Thanks for your time.

Matt


######### Working Unordered ############
/*QUERY: select user, event, group_concat(details) from datatable
group by user,event;*/

package com.company.hadoop.hive.udaf;

import org.apache.hadoop.hive.ql.exec.UDAF;
import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
import org.apache.hadoop.io.Text;

public class UDAFGroupConcat extends UDAF{

        public static class GroupConcatStringEvaluator implements
UDAFEvaluator {
                private Text mOutput;
                private boolean mEmpty;

        public GroupConcatStringEvaluator() {
                super();
                init();
        }

        public void init() {
                mOutput = null;
                mEmpty = true;
        }

        public boolean iterate(Text o) {
                if (o!=null) {
                        if(mEmpty) {
                                mOutput = new Text(o);
                                mEmpty = false;
                        } else {
                                mOutput.set(mOutput.toString()+"
"+o.toString());
                        }
                }
                return true;
        }
        public Text terminatePartial() {return mEmpty ? null : mOutput;}
        public boolean merge(Text o) {return iterate(o);}
        public Text terminate() {return mEmpty ? null : mOutput;}
}
}

############ Not Working Ordered #############
/*QUERY: select user, event, group_concatN(details, detail_id) from
datatable group by user,event;*/

package com.company.hadoop.hive.udaf;

import org.apache.hadoop.hive.ql.exec.UDAF;
import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;

public class UDAFGroupConcatN extends UDAF {

        /**
         * UDAF evaluator that stores each input string into a fixed-size slot
         * selected by the accompanying index, yielding an ordered result.
         *
         * NOTE(review): Hive 0.4 cannot handle a Text[] return from
         * terminatePartial(), so this evaluator still fails semantic analysis
         * on that version; the fixes below address the internal bugs only.
         */
        public static class GroupConcatNStringEvaluator implements
UDAFEvaluator {

                // Number of orderable slots (the original hard-coded 5 in two places).
                private static final int SLOTS = 5;

                private Text[] mArray;
                private boolean mEmpty;

                public GroupConcatNStringEvaluator() {
                        super();
                        init();
                }

        /** Resets all slots to empty Text instances for a new group. */
        public void init() {
                // BUG FIX: `new Text[SLOTS]` alone leaves every element null, so the
                // original mArray[N.get()].set(...) and mArray[i].getLength() calls
                // threw NullPointerException. Pre-fill each slot.
                mArray = new Text[SLOTS];
                for (int i = 0; i < SLOTS; i++) {
                        mArray[i] = new Text();
                }
                mEmpty = true;
        }

        /**
         * Places value o into slot N. Null inputs and out-of-range indices are
         * ignored instead of crashing. Always returns true.
         */
        public boolean iterate(Text o, IntWritable N) {
                if (o != null && N != null && N.get() >= 0 && N.get() < SLOTS) {
                        mArray[N.get()].set(o.toString());
                        mEmpty = false;
                }
                return true;
        }

        public Text[] terminatePartial() { return mEmpty ? null : mArray; }

        /** Fills any still-empty local slot from the partial aggregate. */
        public boolean merge(Text[] o) {
                if (o != null) {
                        // BUG FIX: original loop ran i <= 5, one past the last valid
                        // index, guaranteeing ArrayIndexOutOfBoundsException.
                        for (int i = 0; i < SLOTS; i++) {
                                if (mArray[i].getLength() == 0 && o[i] != null) {
                                        mArray[i].set(o[i].toString());
                                        // BUG FIX: original merge never cleared mEmpty, so an
                                        // evaluator fed only partials returned null and lost data.
                                        mEmpty = false;
                                }
                        }
                }
                return true;
        }

        public Text[] terminate() { return mEmpty ? null : mArray; }
}
}

Re: UDAF on AWS Hive

Posted by Matthew Bryan <go...@gmail.com>.
Thanks Zheng, and thanks for your great support to this list. I took
your idea and wrote the following code that worked for me...I'm no
Java whiz...so it's probably fairly inefficient. I do get to talk to
the Amazon folks from time to time, so I'll definitely mention my
interest in upgrading the Hive version. Thanks again.

Matt

package com.company.hadoop.hive.udaf;

import org.apache.hadoop.hive.ql.exec.UDAF;
import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import java.util.Arrays;

public class UDAFGroupConcat extends UDAF {

        /**
         * UDAF evaluator producing an ordered group_concat by prefixing each
         * value with its index, keeping the accumulated pieces tab-separated
         * and lexicographically sorted at every step. Works on Hive 0.4
         * because the partial state is a single Text.
         */
        public static class GroupConcatStringEvaluator implements
UDAFEvaluator {
                // Tab-separated, sorted accumulation; null until first value.
                private Text mOutput;
                // True while nothing has been accumulated yet.
                private boolean mEmpty;

        public GroupConcatStringEvaluator() {
                super();
                init();
        }

        /** Resets the evaluator for a new group. */
        public void init() {
                mOutput = null;
                mEmpty = true;
        }

        /**
         * Adds `piece` (one or more tab-separated entries) to the state and
         * re-sorts. Extracted because iterate() and merge() shared this logic
         * verbatim in the original.
         */
        private void accumulate(String piece) {
                if (mEmpty) {
                        mOutput = new Text(piece);
                        mEmpty = false;
                } else {
                        String[] split = (mOutput.toString() + "\t" + piece).split("\t");
                        Arrays.sort(split);
                        // Rebuild with StringBuilder instead of repeated String `+`,
                        // which was quadratic in the original.
                        StringBuilder sorted = new StringBuilder(split[0]);
                        for (int i = 1; i < split.length; i++) {
                                sorted.append("\t").append(split[i]);
                        }
                        mOutput.set(sorted.toString());
                }
        }

        /** Tags the value with its index N so sorting recovers the order. */
        public boolean iterate(Text o, IntWritable N) {
                if (o != null) {
                        accumulate(N + " " + o.toString());
                }
                return true;
        }

        /** Partial aggregate; null when no values were seen. */
        public Text terminatePartial() { return mEmpty ? null : mOutput; }

        /** Merges an already-tagged, tab-separated partial aggregate. */
        public boolean merge(Text o) {
                if (o != null) {
                        accumulate(o.toString());
                }
                return true;
        }

        /** Final sorted, tab-separated result; null when no values were seen. */
        public Text terminate() { return mEmpty ? null : mOutput; }
}
}


On Fri, Apr 2, 2010 at 4:11 PM, Matthew Bryan <go...@gmail.com> wrote:
> I'm writing a basic group_concat UDAF for the Amazon version of
> Hive....and it's working fine for unordered groupings. But I can't
> seem to get an ordered version working (filling an array based on an
> IntWritable passed alongside). When I move from using Text return type
> on terminatePartial() to either Text[] or a State class I start
> getting errors:
>
> FAILED: Error in semantic analysis:
> org.apache.hadoop.hive.ql.metadata.HiveException: Cannot recognize
> return type class [Lorg.apache.hadoop.io.Text; from public
> org.apache.hadoop.io.Text[]
> com.company.hadoop.hive.udaf.UDAFGroupConcatN$GroupConcatNStringEvaluator.terminatePartial()
>
> or
>
> FAILED: Error in semantic analysis:
> org.apache.hadoop.hive.ql.metadata.HiveException: Cannot recognize
> return type class
> com.company.hadoop.hive.udaf.UDAFGroupConcatN$UDAFGroupConc
> atNState from public
> com.company.hadoop.hive.udaf.UDAFGroupConcatN$UDAFGroupConcatNState
> com.company.hadoop.hive.udaf.UDAFGroupConcatN$GroupConcatNStringEvaluator.terminatePartial
> ()
>
> What limits are there on the return type of
> terminatePartial()....shouldn't it just have to match the argument of
> merge and nothing more? Keep in mind this is the Amazon version of
> Hive (0.4 I think)....
>
> I put both versions of the UDAF below, ordered and unordered.
>
> Thanks for your time.
>
> Matt
>
>
> ######### Working Unordered ############
> /*QUERY: select user, event, group_concat(details) from datatable
> group by user,event;*/
>
> package com.company.hadoop.hive.udaf;
>
> import org.apache.hadoop.hive.ql.exec.UDAF;
> import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
> import org.apache.hadoop.io.Text;
>
> public class UDAFGroupConcat extends UDAF{
>
>        public static class GroupConcatStringEvaluator implements
> UDAFEvaluator {
>                private Text mOutput;
>                private boolean mEmpty;
>
>        public GroupConcatStringEvaluator() {
>                super();
>                init();
>        }
>
>        public void init() {
>                mOutput = null;
>                mEmpty = true;
>        }
>
>        public boolean iterate(Text o) {
>                if (o!=null) {
>                        if(mEmpty) {
>                                mOutput = new Text(o);
>                                mEmpty = false;
>                        } else {
>                                mOutput.set(mOutput.toString()+"
> "+o.toString());
>                        }
>                }
>                return true;
>        }
>        public Text terminatePartial() {return mEmpty ? null : mOutput;}
>        public boolean merge(Text o) {return iterate(o);}
>        public Text terminate() {return mEmpty ? null : mOutput;}
> }
> }
>
> ############ Not Working Ordered #############
> /*QUERY: select user, event, group_concatN(details, detail_id) from
> datatable group by user,event;*/
>
> package com.company.hadoop.hive.udaf;
>
> import org.apache.hadoop.hive.ql.exec.UDAF;
> import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
> import org.apache.hadoop.io.Text;
> import org.apache.hadoop.io.IntWritable;
>
> public class UDAFGroupConcatN extends UDAF{
>
>        public static class GroupConcatNStringEvaluator implements
> UDAFEvaluator {
>
>                private Text[] mArray;
>                private boolean mEmpty;
>
>                public GroupConcatNStringEvaluator() {
>                        super();
>                        init();
>                }
>
>        public void init() {
>                mArray = new Text[5];
>                mEmpty = true;
>        }
>
>        public boolean iterate(Text o, IntWritable N) {
>                if (o!=null&&N!=null) {
>                        mArray[N.get()].set(o.toString());
>                        mEmpty=false;
>                }
>                return true;
>        }
>        public Text[] terminatePartial() {return mEmpty ? null : mArray;}
>        public boolean merge(Text[] o) {
>                if (o!=null) {
>                        for(int i=0; i<=5; i++){
>                                if(mArray[i].getLength()==0){
>                                        mArray[i].set(o[i].toString());
>                                }
>                        }
>                }
>                return true;
>        }
>
>        public Text[] terminate() {return mEmpty ? null : mArray;}
> }
> }
>

Re: UDAF on AWS Hive

Posted by Zheng Shao <zs...@gmail.com>.
Hive 0.4 has limited support for complex types in UDAFs.
If you are looking for an ad-hoc solution, try packing the data into a
single Text value.

It would be great if you could ask the AWS folks to upgrade Hive to 0.5.
0.5 has over 100 bug fixes and is much more stable.

Zheng

On Fri, Apr 2, 2010 at 1:11 PM, Matthew Bryan <go...@gmail.com> wrote:
> I'm writing a basic group_concat UDAF for the Amazon version of
> Hive....and it's working fine for unordered groupings. But I can't
> seem to get an ordered version working (filling an array based on an
> IntWritable passed alongside). When I move from using Text return type
> on terminatePartial() to either Text[] or a State class I start
> getting errors:
>
> FAILED: Error in semantic analysis:
> org.apache.hadoop.hive.ql.metadata.HiveException: Cannot recognize
> return type class [Lorg.apache.hadoop.io.Text; from public
> org.apache.hadoop.io.Text[]
> com.company.hadoop.hive.udaf.UDAFGroupConcatN$GroupConcatNStringEvaluator.terminatePartial()
>
> or
>
> FAILED: Error in semantic analysis:
> org.apache.hadoop.hive.ql.metadata.HiveException: Cannot recognize
> return type class
> com.company.hadoop.hive.udaf.UDAFGroupConcatN$UDAFGroupConc
> atNState from public
> com.company.hadoop.hive.udaf.UDAFGroupConcatN$UDAFGroupConcatNState
> com.company.hadoop.hive.udaf.UDAFGroupConcatN$GroupConcatNStringEvaluator.terminatePartial
> ()
>
> What limits are there on the return type of
> terminatePartial()....shouldn't it just have to match the argument of
> merge and nothing more? Keep in mind this is the Amazon version of
> Hive (0.4 I think)....
>
> I put both versions of the UDAF below, ordered and unordered.
>
> Thanks for your time.
>
> Matt
>
>
> ######### Working Unordered ############
> /*QUERY: select user, event, group_concat(details) from datatable
> group by user,event;*/
>
> package com.company.hadoop.hive.udaf;
>
> import org.apache.hadoop.hive.ql.exec.UDAF;
> import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
> import org.apache.hadoop.io.Text;
>
> public class UDAFGroupConcat extends UDAF{
>
>        public static class GroupConcatStringEvaluator implements
> UDAFEvaluator {
>                private Text mOutput;
>                private boolean mEmpty;
>
>        public GroupConcatStringEvaluator() {
>                super();
>                init();
>        }
>
>        public void init() {
>                mOutput = null;
>                mEmpty = true;
>        }
>
>        public boolean iterate(Text o) {
>                if (o!=null) {
>                        if(mEmpty) {
>                                mOutput = new Text(o);
>                                mEmpty = false;
>                        } else {
>                                mOutput.set(mOutput.toString()+"
> "+o.toString());
>                        }
>                }
>                return true;
>        }
>        public Text terminatePartial() {return mEmpty ? null : mOutput;}
>        public boolean merge(Text o) {return iterate(o);}
>        public Text terminate() {return mEmpty ? null : mOutput;}
> }
> }
>
> ############ Not Working Ordered #############
> /*QUERY: select user, event, group_concatN(details, detail_id) from
> datatable group by user,event;*/
>
> package com.company.hadoop.hive.udaf;
>
> import org.apache.hadoop.hive.ql.exec.UDAF;
> import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
> import org.apache.hadoop.io.Text;
> import org.apache.hadoop.io.IntWritable;
>
> public class UDAFGroupConcatN extends UDAF{
>
>        public static class GroupConcatNStringEvaluator implements
> UDAFEvaluator {
>
>                private Text[] mArray;
>                private boolean mEmpty;
>
>                public GroupConcatNStringEvaluator() {
>                        super();
>                        init();
>                }
>
>        public void init() {
>                mArray = new Text[5];
>                mEmpty = true;
>        }
>
>        public boolean iterate(Text o, IntWritable N) {
>                if (o!=null&&N!=null) {
>                        mArray[N.get()].set(o.toString());
>                        mEmpty=false;
>                }
>                return true;
>        }
>        public Text[] terminatePartial() {return mEmpty ? null : mArray;}
>        public boolean merge(Text[] o) {
>                if (o!=null) {
>                        for(int i=0; i<=5; i++){
>                                if(mArray[i].getLength()==0){
>                                        mArray[i].set(o[i].toString());
>                                }
>                        }
>                }
>                return true;
>        }
>
>        public Text[] terminate() {return mEmpty ? null : mArray;}
> }
> }
>



-- 
Yours,
Zheng