Posted to user@pig.apache.org by sonia gehlot <so...@gmail.com> on 2011/05/12 19:43:25 UTC

java.lang.OutOfMemoryError while running Pig Job

Hi Guys,

I am running following Pig script in Pig 0.8 version

page_events = LOAD '/user/sgehlot/day=2011-05-10' as
(event_dt_ht:chararray,event_dt_ut:chararray,event_rec_num:int,event_type:int,
client_ip_addr:long,hub_id:int,is_cookied_user:int,local_ontology_node_id:int,
page_type_id:int,content_id:int,product_id:int,referrer_edition_id:int,page_number:int,is_iab_robot:int,browser_id:int,os_id:int,dw_pubsys_id:int,refresh:int,asset_id:int,asset_type_id:int,content_type_id:int,product_type_id:int,outbound_email_id:long,gbal_clc:int,mtype:int,user_action_id:int,referring_partner_id:int,ontology_node_id:int,content_namespace_id:int,product_namespace_id:int,transparent_edition_id:int,default_edition_id:int,event_seq_num:int,is_last_page:int,is_new_user:int,page_duration:int,page_seq_num:int,session_id:long,time_since_sess_start:int,reg_cookie:chararray,urs_app_id:int,is_reg_user:int,edition_id:int,user_agent_id:int,page_type_key:int,referrer_id:int,channel_id:int,level2_id:int,level3_id:int,brand_id:int,content_key:int,product_key:int,edition_key:int,partner_key:int,business_unit_id:int,anon_cookie:chararray,machine_name:chararray,pagehost:chararray,filenameextension:chararray,referrerpath:chararray,referrerhost:chararray,referring_oid:chararray,referring_legacy_oid:chararray,ctype:chararray,cval:chararray,link_tag:chararray,link_type:chararray,sticky_tag:chararray,page_url:chararray,search_category:chararray,partner_subject:chararray,referring_partner_name:chararray,robot_pattern:chararray,browser:chararray,browser_major_version:chararray,browser_minor_version:chararray,os:chararray,os_family:chararray,ttag:chararray,dest_oid:chararray,global_id:chararray,hostname:chararray,path:chararray,filename:chararray,extension:chararray,query:chararray,user_agent:chararray,xrq:chararray,xref:chararray,page_guid:chararray,test_name:chararray,test_group:chararray,test_version:chararray,page_version:chararray,o_sticky_tag:chararray,new_referring_oid:chararray,day:chararray,network_ip:int,site_id:int,search_phrase:chararray,search_attributes:chararray,web_search_phrase:chararray,ip_address:chararray,is_pattern_match_robot:int,protocol:chararray,skc_title:chararray,skc_url:chararray,has_site_search_phrase:int,has_site_search_attribs:int,has_web_search_phrase:int,title_id:chararray,url_id:chararray,network_rev:int);

referrer_group_map = LOAD '/user/sgehlot/oozie/db_data/referrer_group_map'
as
(referrer_id:int, has_web_search_phrase:int, hostname:chararray,
referral_type_id:int,
referral_type_name:chararray,
referrer_group_id:int,referrer_group_name:chararray,referrer_group_cat_id:int,referrer_group_cat:chararray);

filter_pe = FILTER page_events BY is_iab_robot == 0 AND
is_pattern_match_robot == 0 AND day == '2011-05-10';

select_pe_col = FOREACH filter_pe GENERATE day, is_cookied_user,
anon_cookie, reg_cookie, referrer_id, has_web_search_phrase,
business_unit_id;

select_ref_col = FOREACH referrer_group_map GENERATE referrer_id,
has_web_search_phrase, referral_type_id;

jn = JOIN select_ref_col BY (referrer_id, has_web_search_phrase),
select_pe_col BY (referrer_id, has_web_search_phrase);


logic = FOREACH jn GENERATE
    select_pe_col::day,
    select_ref_col::referral_type_id,
    select_pe_col::business_unit_id,
    ((select_pe_col::is_cookied_user == 1) ? select_pe_col::anon_cookie : null) as c_users,
    ((select_pe_col::is_cookied_user == 0) ? select_pe_col::anon_cookie : null) as nc_users,
    ((select_pe_col::reg_cookie == '-1') ? null : select_pe_col::reg_cookie) as registered_users;

group_it = GROUP logic BY (select_pe_col::day,
    select_ref_col::referral_type_id,
    select_pe_col::business_unit_id);

agg_results = FOREACH group_it {
    dst_c_users = DISTINCT logic.c_users;
    dst_nc_users = DISTINCT logic.nc_users;
    dst_registered_users = DISTINCT logic.registered_users;
    GENERATE
        group.select_pe_col::day,
        group.select_ref_col::referral_type_id,
        group.select_pe_col::business_unit_id,
        COUNT(dst_c_users) as c_users,
        COUNT(dst_nc_users) as nc_users,
        COUNT(dst_registered_users) as registered_users;
};

STORE agg_results INTO '/user/sgehlot/pt_users_referral_type_bu_day' USING
PigStorage('\t');


But the job keeps failing with a Java out of memory error. The data set in
"page_events" is huge, but the one in "referrer_group_map" is relatively small.

Here is the error message:

Error: java.lang.OutOfMemoryError: GC overhead limit exceeded
  at java.util.ArrayList.<init>(ArrayList.java:112)
  at java.util.ArrayList.<init>(ArrayList.java:119)
  at org.apache.pig.data.DefaultTuple.<init>(DefaultTuple.java:59)
  at org.apache.pig.data.BinSedesTuple.<init>(BinSedesTuple.java:73)
  at org.apache.pig.data.BinSedesTupleFactory.newTuple(BinSedesTupleFactory.java:33)
  at org.apache.pig.data.InternalCachedBag$CachedBagIterator.hasNext(InternalCachedBag.java:236)
  at org.apache.pig.builtin.Distinct.getDistinctFromNestedBags(Distinct.java:136)
  at org.apache.pig.builtin.Distinct.access$200(Distinct.java:38)
  at org.apache.pig.builtin.Distinct$Intermediate.exec(Distinct.java:101)
  at org.apache.pig.builtin.Distinct$Intermediate.exec(Distinct.java:94)
  at org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.POUserFunc.getNext(POUserFunc.java:216)
  at org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.POUserFunc.getNext(POUserFunc.java:253)
  at org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator.getNext(PhysicalOperator.java:334)
  at org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POForEach.processPlan(POForEach.java:332)
  at org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POForEach.getNext(POForEach.java:284)
  at org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator.processInput(PhysicalOperator.java:290)
  at org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POLocalRearrange.getNext(POLocalRearrange.java:256)
  at org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigCombiner$Combine.processOnePackageOutput(PigCombiner.java:184)
  at org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigCombiner$Combine.reduce(PigCombiner.java:162)
  at org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigCombiner$Combine.reduce(PigCombiner.java:51)
  at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:176)
  at org.apache.hadoop.mapred.Task$NewCombinerRunner.combine(Task.java:1222)
  at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.sortAndSpill(MapTask.java:1265)
  at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.access$1800(MapTask.java:686)
  at org.apache.hadoop.mapred.MapTask$MapOutputBuffer$SpillThread.run(MapTask.java:1173)

Error: java.lang.OutOfMemoryError: Java heap space
  at java.util.ArrayList.<init>(ArrayList.java:112)
  at java.util.ArrayList.<init>(ArrayList.java:119)
  at org.apache.pig.data.DefaultDataBag.<init>(DefaultDataBag.java:54)
  at org.apache.pig.data.DefaultBagFactory.newDefaultBag(DefaultBagFactory.java:33)
  at org.apache.pig.data.BinInterSedes.readBag(BinInterSedes.java:143)
  at org.apache.pig.data.BinInterSedes.readDatum(BinInterSedes.java:275)
  at org.apache.pig.data.BinInterSedes.readDatum(BinInterSedes.java:251)
  at org.apache.pig.data.BinInterSedes.readTuple(BinInterSedes.java:111)
  at org.apache.pig.data.BinInterSedes.readDatum(BinInterSedes.java:270)
  at org.apache.pig.data.BinInterSedes.readDatum(BinInterSedes.java:251)
  at org.apache.pig.data.BinInterSedes.addColsToTuple(BinInterSedes.java:555)
  at org.apache.pig.data.BinSedesTuple.readFields(BinSedesTuple.java:64)
  at org.apache.pig.impl.io.PigNullableWritable.readFields(PigNullableWritable.java:114)
  at org.apache.hadoop.io.serializer.WritableSerialization$WritableDeserializer.deserialize(WritableSerialization.java:67)
  at org.apache.hadoop.io.serializer.WritableSerialization$WritableDeserializer.deserialize(WritableSerialization.java:40)
  at org.apache.hadoop.mapreduce.ReduceContext.nextKeyValue(ReduceContext.java:116)
  at org.apache.hadoop.mapreduce.ReduceContext$ValueIterator.next(ReduceContext.java:163)
  at org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POCombinerPackage.getNext(POCombinerPackage.java:141)
  at org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigCombiner$Combine.processOnePackageOutput(PigCombiner.java:171)
  at org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigCombiner$Combine.reduce(PigCombiner.java:162)
  at org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigCombiner$Combine.reduce(PigCombiner.java:51)
  at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:176)
  at org.apache.hadoop.mapred.Task$NewCombinerRunner.combine(Task.java:1222)
  at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.sortAndSpill(MapTask.java:1265)
  at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.access$1800(MapTask.java:686)
  at org.apache.hadoop.mapred.MapTask$MapOutputBuffer$SpillThread.run(MapTask.java:1173)

Any idea or suggestion as to what could be causing this error?

Thanks for any help,
Sonia

Re: java.lang.OutOfMemoryError while running Pig Job

Posted by sonia gehlot <so...@gmail.com>.
Hi All,

This script worked for me by setting the following option:

set pig.exec.nocombiner true;
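
That is consistent with the stack traces in the original post: both failures
happen inside PigCombiner$Combine while it is building the DISTINCT bags, so
disabling the map-side combiner avoids exactly that code path, at the cost of
shipping more map output to the reducers. As a minimal sketch, the statement
simply goes at the top of the script, ahead of the statements it should affect:

set pig.exec.nocombiner true;   -- disable the map-side combiner for this script
-- ... rest of the script from the original post, unchanged ...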

Thanks for your help.

Sonia

On Mon, May 23, 2011 at 3:03 PM, Dmitriy Ryaboy <dv...@gmail.com> wrote:

> you can group by your key + the thing you want distinct counts of, and
> generate counts of those.
>
> On Mon, May 23, 2011 at 2:17 PM, sonia gehlot <so...@gmail.com>
> wrote:
> > Hi Shawn,
> >
> > I tried using SUBSTRING in my script with different combinations but I am
> > still getting OOM errors.
> >
> > Is there any other alternative for doing a distinct count against a very
> > large set of data?
> >
> > Thanks,
> > Sonia
> >
> > On Fri, May 20, 2011 at 1:54 PM, Xiaomeng Wan <sh...@gmail.com>
> wrote:
> >
> >> It serves two purposes:
> >> 1. divide the group into smaller subgroups
> >> 2. make sure distinct in subgroup => distinct in group
> >>
> >> Shawn
> >>
> >> On Fri, May 20, 2011 at 2:20 PM, sonia gehlot <so...@gmail.com>
> >> wrote:
> >> > Hey, I am sorry but I didn't get how substring will help in this?
> >> >
> >> > On Fri, May 20, 2011 at 1:08 PM, Xiaomeng Wan <sh...@gmail.com>
> >> wrote:
> >> >
> >> >> you can try using some divide and conquer, like this:
> >> >>
> >> >> a = group data by (key, SUBSTRING(the_field_to_be_distinct, 0, 2));
> >> >> b = foreach a { x = distinct data.the_field_to_be_distinct; generate
> >> >> group.key as key, COUNT(x) as cnt; }
> >> >> c = group b by key;
> >> >> d = foreach c generate group as key, SUM(b.cnt) as cnt;
> >> >>
> >> >> Use a longer substring if you still run into OOM.
> >> >>
> >> >> Regards,
> >> >> Shawn
> >> >>
> >> >> On Fri, May 20, 2011 at 1:11 PM, sonia gehlot <
> sonia.gehlot@gmail.com>
> >> >> wrote:
> >> >> > Hey Thejas,
> >> >> >
> >> >> > I tried setting the property pig.cachedbag.memusage to 0.1 and also
> >> >> > tried computing the distinct count for each type separately, but I am
> >> >> > still getting errors like
> >> >> >
> >> >> > Error: java.lang.OutOfMemoryError: Java heap space
> >> >> > Caused by: java.lang.OutOfMemoryError: GC overhead limit exceeded
> >> >> > java.io.IOException: Spill failed
> >> >> >
> >> >> > Is there some other way to do a distinct count that you may suggest?
> >> >> >
> >> >> > Thanks for your help.
> >> >> >
> >> >> > Sonia
> >> >> >
> >> >> > On Fri, May 13, 2011 at 4:46 PM, Thejas M Nair <
> tejas@yahoo-inc.com>
> >> >> wrote:
> >> >> >
> >> >> >> The stack trace shows that the OOM error is happening when the
> >> >> >> distinct is being applied. It looks like in some record(s) of the
> >> >> >> relation group_it, one or more of the following bags is very large -
> >> >> >> logic.c_users, logic.nc_users or logic.registered_users.
> >> >> >>
> >> >> >> Try setting the property pig.cachedbag.memusage to 0.1 or lower (
> >> >> >> -Dpig.cachedbag.memusage=0.1 on java command line). It controls
> the
> >> >> memory
> >> >> >> used by pig internal bags, including those used by distinct.
> >> >> >>
> >> >> >> If that does not work, you can try computing count-distinct for
> each
> >> >> type
> >> >> >> of
> >> >> >> user separately and then combining the result.
> >> >> >>
> >> >> >>
> >> >> >> You might want to have a look at this way of optimizing
> >> count-distinct
> >> >> >> queries where skew can be a problem -
> >> >> >> https://issues.apache.org/jira/browse/PIG-1846
> >> >> >>
> >> >> >> -thejas

Re: java.lang.OutOfMemoryError while running Pig Job

Posted by Dmitriy Ryaboy <dv...@gmail.com>.
you can group by your key + the thing you want distinct counts of, and
generate counts of those.
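
A sketch of one way to read that, reusing the relations and field names from
the script at the top of the thread and showing only the c_users count
(nc_users and registered_users would follow the same pattern); this is an
untested illustration, not a drop-in replacement:

-- project the grouping key plus the value whose distinct count is wanted
pairs = FOREACH logic GENERATE
    select_pe_col::day AS day,
    select_ref_col::referral_type_id AS referral_type_id,
    select_pe_col::business_unit_id AS business_unit_id,
    c_users;
-- nulls would otherwise form their own group; drop them so they are not counted as a user
pairs_nn = FILTER pairs BY c_users IS NOT NULL;

-- grouping by key + value de-duplicates without a nested DISTINCT
by_key_user = GROUP pairs_nn BY (day, referral_type_id, business_unit_id, c_users);
one_per_user = FOREACH by_key_user GENERATE
    group.day AS day,
    group.referral_type_id AS referral_type_id,
    group.business_unit_id AS business_unit_id;

-- a plain COUNT per key is now the distinct-user count
by_key = GROUP one_per_user BY (day, referral_type_id, business_unit_id);
c_user_counts = FOREACH by_key GENERATE FLATTEN(group), COUNT(one_per_user) AS c_users;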


Re: java.lang.OutOfMemoryError while running Pig Job

Posted by sonia gehlot <so...@gmail.com>.
Hi Shawn,

I tried using SUBSTRING in my script with different combinations but I am still
getting OOM errors.

Is there any other alternative for doing a distinct count against a very large
set of data?

Thanks,
Sonia


Re: java.lang.OutOfMemoryError while running Pig Job

Posted by Xiaomeng Wan <sh...@gmail.com>.
It serves two purposes:
1. divide the group into smaller subgroups
2. make sure distinct in subgroup => distinct in group
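
For example, if the cookie values in one group are ab12, ab34 and cd56, grouping additionally by SUBSTRING(value, 0, 2) puts ab12 and ab34 into an "ab" subgroup and cd56 into a "cd" subgroup. Every distinct value lands in exactly one subgroup, so the per-subgroup distinct counts (2 + 1) add up to the overall distinct count (3), while each DISTINCT only ever has to hold one subgroup in memory.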

Shawn


Re: java.lang.OutOfMemoryError while running Pig Job

Posted by sonia gehlot <so...@gmail.com>.
Hey, I am sorry, but I didn't get how the substring will help with this?


Re: java.lang.OutOfMemoryError while running Pig Job

Posted by Xiaomeng Wan <sh...@gmail.com>.
You can try using some divide and conquer, like this:

a = group data by (key, SUBSTRING(the_field_to_be_distinct, 0, 2));
b = foreach a { x = distinct data.the_field_to_be_distinct; generate
    group.key as key, COUNT(x) as cnt; };
c = group b by key;
d = foreach c generate group as key, SUM(b.cnt) as cnt;

Use a longer substring if you are still running into OOM.

Regards,
Shawn
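
Translated to the script from the first mail, a rough sketch of the same trick for the c_users count could look like this (relation and field names are taken from that script; nc_users and registered_users would be handled the same way, and the aliases sub_grp, sub_cnt, key_grp and c_users_cnt are made up for this example):

-- split each group further by the first two characters of the cookie value
sub_grp = GROUP logic BY (select_pe_col::day,
                          select_ref_col::referral_type_id,
                          select_pe_col::business_unit_id,
                          SUBSTRING(c_users, 0, 2));
-- distinct/count within each (group key, cookie prefix) subgroup
sub_cnt = FOREACH sub_grp {
            dst = DISTINCT logic.c_users;
            GENERATE group.select_pe_col::day AS day,
                     group.select_ref_col::referral_type_id AS referral_type_id,
                     group.select_pe_col::business_unit_id AS business_unit_id,
                     COUNT(dst) AS cnt;
          };
-- sum the per-prefix counts back up to the original group key
key_grp = GROUP sub_cnt BY (day, referral_type_id, business_unit_id);
c_users_cnt = FOREACH key_grp GENERATE FLATTEN(group), SUM(sub_cnt.cnt) AS c_users;

The extra SUBSTRING key spreads each (day, referral_type_id, business_unit_id) group over several smaller bags, so no single DISTINCT has to materialize all the cookies of one group at once.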


Re: java.lang.OutOfMemoryError while running Pig Job

Posted by sonia gehlot <so...@gmail.com>.
Hey Thejas,

I tried setting the property pig.cachedbag.memusage to 0.1 and also tried
computing the distinct count for each type separately, but I am still getting
errors like

Error: java.lang.OutOfMemoryError: Java heap space
Caused by: java.lang.OutOfMemoryError: GC overhead limit exceeded
java.io.IOException: Spill failed

Is there some other way to do the distinct count that you may suggest?

Thanks for your help.

Sonia


Re: java.lang.OutOfMemoryError while running Pig Job

Posted by Thejas M Nair <te...@yahoo-inc.com>.
The stack trace shows that the OOM error is happening when the distinct is
being applied. It looks like in some record(s) of the relation group_it, one
or more of the following bags is very large - logic.c_users, logic.nc_users or
logic.registered_users.

Try setting the property pig.cachedbag.memusage to 0.1 or lower (
-Dpig.cachedbag.memusage=0.1 on java command line). It controls the memory
used by pig internal bags, including those used by distinct.
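
For example, one way to pass that flag (a sketch, assuming the stock bin/pig launcher script, which forwards PIG_OPTS to the JVM; the script name is just a placeholder):

export PIG_OPTS="$PIG_OPTS -Dpig.cachedbag.memusage=0.1"
pig my_script.pig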

If that does not work, you can try computing count-distinct for each type of
user separately and then combining the result.
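
For instance, a sketch of that split for just the c_users count, reusing the group_it and logic relations from the quoted script below (the other two counts would be separate, near-identical statements whose outputs can then be joined back on the grouping key):

-- one DISTINCT per statement instead of three per group
c_only = FOREACH group_it {
           dst_c_users = DISTINCT logic.c_users;
           GENERATE group.select_pe_col::day,
                    group.select_ref_col::referral_type_id,
                    group.select_pe_col::business_unit_id,
                    COUNT(dst_c_users) AS c_users;
         };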


You might want to have a look at this way of optimizing count-distinct
queries where skew can be a problem -
https://issues.apache.org/jira/browse/PIG-1846

-thejas





On 5/12/11 10:43 AM, "sonia gehlot" <so...@gmail.com> wrote:

> Hi Guys,
> 
> I am running following Pig script in Pig 0.8 version
> 
> page_events = LOAD '/user/sgehlot/day=2011-05-10' as
> (event_dt_ht:chararray,event_dt_ut:chararray,event_rec_num:int,event_type:int,
> client_ip_addr:long,hub_id:int,is_cookied_user:int,local_ontology_node_id:int,
> page_type_id:int,content_id:int,product_id:int,referrer_edition_id:int,page_nu
> mber:int,is_iab_robot:int,browser_id:int,os_id:int,dw_pubsys_id:int,refresh:in
> t,asset_id:int,asset_type_id:int,content_type_id:int,product_type_id:int,outbo
> und_email_id:long,gbal_clc:int,mtype:int,user_action_id:int,referring_partner_
> id:int,ontology_node_id:int,content_namespace_id:int,product_namespace_id:int,
> transparent_edition_id:int,default_edition_id:int,event_seq_num:int,is_last_pa
> ge:int,is_new_user:int,page_duration:int,page_seq_num:int,session_id:long,time
> _since_sess_start:int,reg_cookie:chararray,urs_app_id:int,is_reg_user:int,edit
> ion_id:int,user_agent_id:int,page_type_key:int,referrer_id:int,channel_id:int,
> level2_id:int,level3_id:int,brand_id:int,content_key:int,product_key:int,editi
> on_key:int,partner_key:int,business_unit_id:int,anon_cookie:chararray,machine_
> name:chararray,pagehost:chararray,filenameextension:chararray,referrerpath:cha
> rarray,referrerhost:chararray,referring_oid:chararray,referring_legacy_oid:cha
> rarray,ctype:chararray,cval:chararray,link_tag:chararray,link_type:chararray,s
> ticky_tag:chararray,page_url:chararray,search_category:chararray,partner_subje
> ct:chararray,referring_partner_name:chararray,robot_pattern:chararray,browser:
> chararray,browser_major_version:chararray,browser_minor_version:chararray,os:c
> hararray,os_family:chararray,ttag:chararray,dest_oid:chararray,global_id:chara
> rray,hostname:chararray,path:chararray,filename:chararray,extension:chararray,
> query:chararray,user_agent:chararray,xrq:chararray,xref:chararray,page_guid:ch
> ararray,test_name:chararray,test_group:chararray,test_version:chararray,page_v
> ersion:chararray,o_sticky_tag:chararray,new_referring_oid:chararray,day:charar
> ray,network_ip:int,site_id:int,search_phrase:chararray,search_attributes:chara
> rray,web_search_phrase:chararray,ip_address:chararray,is_pattern_match_robot:i
> nt,protocol:chararray,skc_title:chararray,skc_url:chararray,has_site_search_ph
> rase:int,has_site_search_attribs:int,has_web_search_phrase:int,title_id:charar
> ray,url_id:chararray,network_rev:int);
> 
> referrer_group_map = LOAD '/user/sgehlot/oozie/db_data/referrer_group_map'
> as
> (referrer_id:int, has_web_search_phrase:int, hostname:chararray,
> referral_type_id:int,
> referral_type_name:chararray,
> referrer_group_id:int,referrer_group_name:chararray,referrer_group_cat_id:int,
> referrer_group_cat:chararray);
> 
> filter_pe = FILTER page_events BY is_iab_robot == 0 AND
> is_pattern_match_robot == 0 AND day == '2011-05-10';
> 
> select_pe_col = FOREACH filter_pe GENERATE day, is_cookied_user,
> anon_cookie, reg_cookie, referrer_id, has_web_search_phrase,
> business_unit_id;
> 
> select_ref_col = FOREACH referrer_group_map GENERATE referrer_id,
> has_web_search_phrase, referral_type_id;
> 
> jn = JOIN select_ref_col BY (referrer_id, has_web_search_phrase),
> select_pe_col BY (referrer_id, has_web_search_phrase);
> 
> 
> logic = FOREACH jn GENERATE
> select_pe_col::day,
> select_ref_col::referral_type_id,
>  select_pe_col::business_unit_id,
> ((select_pe_col::is_cookied_user == 1) ? select_pe_col::anon_cookie : null)
> as c_users,
>  ((select_pe_col::is_cookied_user == 0) ? select_pe_col::anon_cookie : null)
> as nc_users,
> ((select_pe_col::reg_cookie == '-1') ? null : select_pe_col::reg_cookie) as
> registered_users;
>  group_it = GROUP logic BY (select_pe_col::day,
>    select_ref_col::referral_type_id,
>    select_pe_col::business_unit_id);
> 
> agg_results = FOREACH group_it{
>  dst_c_users = DISTINCT logic.c_users;
> dst_nc_users = DISTINCT logic.nc_users;
>  dst_registered_users = DISTINCT logic.registered_users;
>       GENERATE
>        group.select_pe_col::day,
>         group.select_ref_col::referral_type_id,
> group.select_pe_col::business_unit_id,
>  COUNT(dst_c_users) as c_users,
> COUNT(dst_nc_users) as nc_users,
> COUNT(dst_registered_users) as registered_users;
>  };
> 
> STORE agg_results INTO '/user/sgehlot/pt_users_referral_type_bu_day' USING
> PigStorage('\t');
> 
> 
> But I am keep on failing because of Java out of memory error. Data set in
> "page_events" is huge but in "referrer_group_map" is relatively very less.
> 
> Here is the error message:
> 
> Error: java.lang.OutOfMemoryError: GC overhead limit exceeded
> at java.util.ArrayList.<init>(ArrayList.java:112)
> at java.util.ArrayList.<init>(ArrayList.java:119)
> at org.apache.pig.data.DefaultTuple.<init>(DefaultTuple.java:59)
> at org.apache.pig.data.BinSedesTuple.<init>(BinSedesTuple.java:73)
> at org.apache.pig.data.BinSedesTupleFactory.newTuple(BinSedesTupleFactory.java:33)
> at org.apache.pig.data.InternalCachedBag$CachedBagIterator.hasNext(InternalCachedBag.java:236)
> at org.apache.pig.builtin.Distinct.getDistinctFromNestedBags(Distinct.java:136)
> at org.apache.pig.builtin.Distinct.access$200(Distinct.java:38)
> at org.apache.pig.builtin.Distinct$Intermediate.exec(Distinct.java:101)
> at org.apache.pig.builtin.Distinct$Intermediate.exec(Distinct.java:94)
> at org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.POUserFunc.getNext(POUserFunc.java:216)
> at org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.POUserFunc.getNext(POUserFunc.java:253)
> at org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator.getNext(PhysicalOperator.java:334)
> at org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POForEach.processPlan(POForEach.java:332)
> at org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POForEach.getNext(POForEach.java:284)
> at org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator.processInput(PhysicalOperator.java:290)
> at org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POLocalRearrange.getNext(POLocalRearrange.java:256)
> at org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigCombiner$Combine.processOnePackageOutput(PigCombiner.java:184)
> at org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigCombiner$Combine.reduce(PigCombiner.java:162)
> at org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigCombiner$Combine.reduce(PigCombiner.java:51)
> at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:176)
> at org.apache.hadoop.mapred.Task$NewCombinerRunner.combine(Task.java:1222)
> at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.sortAndSpill(MapTask.java:1265)
> at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.access$1800(MapTask.java:686)
> at org.apache.hadoop.mapred.MapTask$MapOutputBuffer$SpillThread.run(MapTask.java:1173)
> 
> Error: java.lang.OutOfMemoryError: Java heap space
> at java.util.ArrayList.<init>(ArrayList.java:112)
> at java.util.ArrayList.<init>(ArrayList.java:119)
> at org.apache.pig.data.DefaultDataBag.<init>(DefaultDataBag.java:54)
> at org.apache.pig.data.DefaultBagFactory.newDefaultBag(DefaultBagFactory.java:33)
> at org.apache.pig.data.BinInterSedes.readBag(BinInterSedes.java:143)
> at org.apache.pig.data.BinInterSedes.readDatum(BinInterSedes.java:275)
> at org.apache.pig.data.BinInterSedes.readDatum(BinInterSedes.java:251)
> at org.apache.pig.data.BinInterSedes.readTuple(BinInterSedes.java:111)
> at org.apache.pig.data.BinInterSedes.readDatum(BinInterSedes.java:270)
> at org.apache.pig.data.BinInterSedes.readDatum(BinInterSedes.java:251)
> at org.apache.pig.data.BinInterSedes.addColsToTuple(BinInterSedes.java:555)
> at org.apache.pig.data.BinSedesTuple.readFields(BinSedesTuple.java:64)
> at org.apache.pig.impl.io.PigNullableWritable.readFields(PigNullableWritable.java:114)
> at org.apache.hadoop.io.serializer.WritableSerialization$WritableDeserializer.deserialize(WritableSerialization.java:67)
> at org.apache.hadoop.io.serializer.WritableSerialization$WritableDeserializer.deserialize(WritableSerialization.java:40)
> at org.apache.hadoop.mapreduce.ReduceContext.nextKeyValue(ReduceContext.java:116)
> at org.apache.hadoop.mapreduce.ReduceContext$ValueIterator.next(ReduceContext.java:163)
> at org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POCombinerPackage.getNext(POCombinerPackage.java:141)
> at org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigCombiner$Combine.processOnePackageOutput(PigCombiner.java:171)
> at org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigCombiner$Combine.reduce(PigCombiner.java:162)
> at org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigCombiner$Combine.reduce(PigCombiner.java:51)
> at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:176)
> at org.apache.hadoop.mapred.Task$NewCombinerRunner.combine(Task.java:1222)
> at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.sortAndSpill(MapTask.java:1265)
> at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.access$1800(MapTask.java:686)
> at org.apache.hadoop.mapred.MapTask$MapOutputBuffer$SpillThread.run(MapTask.java:1173)
> 
> Any ideas or suggestions about what could be causing this error?
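
Both traces show the OutOfMemoryError happening in the map-side combiner while it evaluates the algebraic DISTINCT for the nested FOREACH, so the per-key distinct bags are what is filling the heap. Two knobs that are sometimes tried for this pattern, shown as a hedged sketch rather than a confirmed fix (the heap size is illustrative and the script file name is a placeholder):

# pig.exec.nocombiner=true skips the combiner, so the DISTINCTs run only on the reduce side;
# mapred.child.java.opts raises the heap available to each map/reduce task.
pig -Dpig.exec.nocombiner=true \
    -Dmapred.child.java.opts=-Xmx1024m \
    referral_type_report.pig
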
> 
> Thanks for any help,
> Sonia
> 


--