Posted to issues@hawq.apache.org by "Lei Chang (JIRA)" <ji...@apache.org> on 2015/11/21 04:04:11 UTC

[jira] [Closed] (HAWQ-62) Incorrect calculation of the locus describing a hash table introduces core dump in OLAP query

     [ https://issues.apache.org/jira/browse/HAWQ-62?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Lei Chang closed HAWQ-62.
-------------------------

> Incorrect calculation of the locus describing a hash table introduces core dump in OLAP query
> ---------------------------------------------------------------------------------------------
>
>                 Key: HAWQ-62
>                 URL: https://issues.apache.org/jira/browse/HAWQ-62
>             Project: Apache HAWQ
>          Issue Type: Bug
>          Components: Core
>            Reporter: Ruilong Huo
>            Assignee: Hubert Zhang
>            Priority: Critical
>             Fix For: 2.0.0-beta-incubating
>
>
> During execution of an OLAP query, a core dump was generated. Root cause analysis shows that it was introduced by an incorrect calculation of the locus describing a hash-distributed table.
> The detailed symptoms, reproduction steps, and recommended fix are below:
> 1. Call stack of core dump
> {noformat}
> $ sudo gdb hawq/bin/postgres -c core.postgres.148248.1444900646.11.501.501
> ...
> Core was generated by `postgres: port 31000, gpadmin parquetolap [local] con123 [local] cmd1 SELECT  '.
> Program terminated with signal 11, Segmentation fault.
> #0  0x0000003c1ec0f5db in raise () from /lib64/libpthread.so.0
> Missing separate debuginfos, use: debuginfo-install hawq-2.0.0.0-17409.x86_64
> (gdb) bt
> #0  0x0000003c1ec0f5db in raise () from /lib64/libpthread.so.0
> #1  0x00000000009e76ac in SafeHandlerForSegvBusIll (processName=0xd52b4b "Master process", postgres_signal_arg=11) at elog.c:4497
> #2  0x00000000009e78d9 in StandardHandlerForSigillSigsegvSigbus_OnMainThread (processName=0xd52b4b "Master process", postgres_signal_arg=11) at elog.c:4575
> #3  0x00000000008fa17f in CdbProgramErrorHandler (postgres_signal_arg=11) at postgres.c:3403
> #4  <signal handler called>
> #5  0x000000000087dabf in get_sortgroupclause_tle (sortClause=0x0, targetList=0x7fa2fc880d38) at tlist.c:194
> #6  0x000000000087dce1 in get_sortgroupclause_expr (sortClause=0x0, targetList=0x7fa2fc880d38) at tlist.c:281
> #7  0x000000000087dd37 in get_sortgrouplist_exprs (sortClauses=0x7fa2fc8e1608, targetList=0x7fa2fc880d38) at tlist.c:302
> #8  0x0000000000aefd20 in add_motion_to_dqa_child (plan=0x7fa2fc881e28, root=0x7fa2fc8e20c0, motion_added=0x7fff1edc67df "") at cdbgroup.c:6801
> #9  0x0000000000ae6370 in join_dqa_coplan (root=0x7fa2fc8e20c0, ctx=0x7fff1edc6ab0, outer=0x7fa2fc881e28, dqa_index=1) at cdbgroup.c:2330
> #10 0x0000000000ae4d19 in make_three_stage_agg_plan (root=0x7fa2fc8e20c0, ctx=0x7fff1edc6ab0) at cdbgroup.c:1690
> #11 0x0000000000ae3110 in cdb_grouping_planner (root=0x7fa2fc8e20c0, agg_counts=0x7fff1edc6dd0, group_context=0x7fff1edc6d50) at cdbgroup.c:899
> #12 0x0000000000848bbb in grouping_planner (root=0x7fa2fc8e20c0, tuple_fraction=0) at planner.c:1800
> #13 0x0000000000847468 in subquery_planner (glob=0x2e6faa8, parse=0x7fa2fc8da5c0, parent_root=0x0, tuple_fraction=0, subroot=0x7fff1edc70d0, config=0x7fa2fc8e2048) at planner.c:1002
> #14 0x000000000084660e in standard_planner (parse=0x7fa2fc8da5c0, cursorOptions=0, boundParams=0x0) at planner.c:591
> #15 0x00000000008462c2 in resource_negotiator (parse=0x2e6df80, cursorOptions=0, boundParams=0x0, resourceLife=QRL_ONCE, result=0x7fff1edc7308) at planner.c:468
> #16 0x0000000000845ba7 in planner (parse=0x2e6df80, cursorOptions=0, boundParams=0x0, resourceLife=QRL_ONCE) at planner.c:303
> #17 0x00000000008f5b11 in pg_plan_query (querytree=0x2e6df80, boundParams=0x0, resource_life=QRL_ONCE) at postgres.c:816
> #18 0x00000000008f5c1c in pg_plan_queries (querytrees=0x7fa2fc8b19b8, boundParams=0x0, needSnapshot=0 '\000', resource_life=QRL_ONCE) at postgres.c:889
> #19 0x00000000008f7220 in exec_simple_query (
>     query_string=0x2e17f78 "SELECT sale.cn,sale.prc,sale.vn,sale.pn,GROUP_ID(), TO_CHAR(COALESCE(COVAR_POP(floor(sale.vn),floor(sale.cn)),0),'99999999.9999999'),TO_CHAR(COALESCE(COUNT(DISTINCT floor(sale.pn-sale.vn)),0),'9999999"..., seqServerHost=0x0, seqServerPort=-1) at postgres.c:1647
> #20 0x00000000008fc0e0 in PostgresMain (argc=4, argv=0x2d9af18, username=0x2d966b8 "gpadmin") at postgres.c:4691
> #21 0x000000000089ed72 in BackendRun (port=0x2d1beb0) at postmaster.c:5845
> #22 0x000000000089e1fc in BackendStartup (port=0x2d1beb0) at postmaster.c:5438
> #23 0x000000000089891a in ServerLoop () at postmaster.c:2140
> #24 0x00000000008979f0 in PostmasterMain (argc=9, argv=0x2d161b0) at postmaster.c:1432
> #25 0x00000000007b14b2 in main (argc=9, argv=0x2d161b0) at main.c:226
> {noformat}
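> Frames #5-#7 show where the crash actually fires: while planning the distinct-qualified aggregates (frames #8-#10), get_sortgroupclause_tle is entered with sortClause=0x0 and immediately dereferences it. As a minimal sketch of that call chain (modeled on upstream PostgreSQL tlist.c; the exact HAWQ source and clause type name may differ):
> {code}
> /* Sketch of the tlist.c helpers in frames #5-#6; not the exact HAWQ code.
>  * With sortClause == NULL, the very first dereference
>  * (sortClause->tleSortGroupRef) faults, matching the backtrace. */
> TargetEntry *
> get_sortgroupclause_tle(SortClause *sortClause, List *targetList)
> {
>     return get_sortgroupref_tle(sortClause->tleSortGroupRef, targetList);
> }
>
> Node *
> get_sortgroupclause_expr(SortClause *sortClause, List *targetList)
> {
>     TargetEntry *tle = get_sortgroupclause_tle(sortClause, targetList);
>
>     return (Node *) tle->expr;
> }
> {code}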
> 2. Steps for reproduction
> (1) Schema and data
> {code}
> create table sale
> (
>         cn int not null,
>         vn int not null,
>         pn int not null,
>         dt date not null,
>         qty int not null,
>         prc float not null
>         --primary key (cn, vn, pn)
> ) WITH (appendonly=true, orientation = parquet) distributed by (cn,vn,pn);
> insert into sale values
>   ( 2, 40, 100, '1401-1-1', 1100, 2400),
>   ( 1, 10, 200, '1401-3-1', 1, 0),
>   ( 3, 40, 200, '1401-4-1', 1, 0),
>   ( 1, 20, 100, '1401-5-1', 1, 0),
>   ( 1, 30, 300, '1401-5-2', 1, 0),
>   ( 1, 50, 400, '1401-6-1', 1, 0),
>   ( 2, 50, 400, '1401-6-1', 1, 0),
>   ( 1, 30, 500, '1401-6-1', 12, 5),
>   ( 3, 30, 500, '1401-6-1', 12, 5),
>   ( 3, 30, 600, '1401-6-1', 12, 5),
>   ( 4, 40, 700, '1401-6-1', 1, 1),
>   ( 4, 40, 800, '1401-6-1', 1, 1);
> {code}
> (2) SQL statement
> {code}
> -- Query mixing distinct-qualified aggregates (COUNT(DISTINCT ...),
> -- STDDEV(DISTINCT ...)) with COVAR_POP() and CUBE grouping extensions
> SELECT sale.cn, sale.prc, sale.vn, sale.pn, GROUP_ID(),
>        TO_CHAR(COALESCE(COVAR_POP(floor(sale.vn), floor(sale.cn)), 0), '99999999.9999999'),
>        TO_CHAR(COALESCE(COUNT(DISTINCT floor(sale.pn - sale.vn)), 0), '99999999.9999999'),
>        TO_CHAR(COALESCE(COUNT(DISTINCT floor(sale.qty)), 0), '99999999.9999999'),
>        TO_CHAR(COALESCE(VAR_SAMP(floor(sale.prc)), 0), '99999999.9999999'),
>        TO_CHAR(COALESCE(STDDEV(DISTINCT floor(sale.qty - sale.pn)), 0), '99999999.9999999'),
>        TO_CHAR(COALESCE(MIN(floor(sale.pn)), 0), '99999999.9999999'),
>        TO_CHAR(COALESCE(STDDEV_SAMP(floor(sale.cn)), 0), '99999999.9999999'),
>        TO_CHAR(COALESCE(VAR_SAMP(floor(sale.prc + sale.prc)), 0), '99999999.9999999'),
>        TO_CHAR(COALESCE(STDDEV_SAMP(floor(sale.qty)), 0), '99999999.9999999'),
>        TO_CHAR(COALESCE(SUM(floor(sale.cn - sale.pn)), 0), '99999999.9999999')
> FROM sale
> GROUP BY (sale.cn, sale.pn), (sale.dt, sale.cn), (sale.dt, sale.cn),
>          (sale.vn), (sale.pn, sale.qty, sale.pn),
>          CUBE((sale.vn, sale.cn), (sale.cn, sale.qty), (sale.vn)),
>          (), sale.prc;
> {code}
> (3) psql output
> {noformat}
> psql:query197.sql:5: server closed the connection unexpectedly
>         This probably means the server terminated abnormally
>         before or while processing the request.
> psql:query197.sql:5: connection to server was lost
> {noformat}
> 3. Recommended fix
> {noformat}
> rhuo-mbp:apache-hawq rhuo$ git diff src/backend/cdb/cdbpathlocus.c
> diff --git a/src/backend/cdb/cdbpathlocus.c b/src/backend/cdb/cdbpathlocus.c
> index 2471fd4..9fdd485 100644
> --- a/src/backend/cdb/cdbpathlocus.c
> +++ b/src/backend/cdb/cdbpathlocus.c
> @@ -303,8 +303,9 @@ cdbpathlocus_from_baserel(struct PlannerInfo   *root,
>                                         }
>                                 }
>                         }
> -        if (isRelationRuntimeHash && (policy->nattrs > 0) && allocatedResource &&
> -            (list_length(root->glob->resource->segments) == (policy->bucketnum)))
> +        if (isRelationRuntimeHash && (policy->nattrs > 0) && ((allocatedResource &&
> +            (list_length(root->glob->resource->segments) == (policy->bucketnum))) ||
> +            (!allocatedResource)))
>          {
>                 List *partkey = cdb_build_distribution_pathkeys(root,
>                                                                 rel,
> {noformat}
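> In plain terms, the patch relaxes the guard so that a runtime-hashed table keeps its Hashed locus when no resource has been allocated yet; the segment-count check only applies once a resource exists. A minimal sketch of the before/after predicate (not the actual HAWQ source; nsegments here is shorthand for list_length(root->glob->resource->segments)):
> {code}
> /* Before the patch: Hashed locus only when a resource was allocated
>  * AND the number of allocated segments equals the table's bucket
>  * number. */
> static bool
> hashed_locus_before(bool isRelationRuntimeHash, int nattrs,
>                     bool allocatedResource, int nsegments, int bucketnum)
> {
>     return isRelationRuntimeHash && nattrs > 0 &&
>            allocatedResource && nsegments == bucketnum;
> }
>
> /* After the patch: equivalent to
>  * ((allocatedResource && nsegments == bucketnum) || !allocatedResource).
>  * With no allocated resource the table is still treated as Hashed;
>  * the segment-count check applies only once a resource exists. */
> static bool
> hashed_locus_after(bool isRelationRuntimeHash, int nattrs,
>                    bool allocatedResource, int nsegments, int bucketnum)
> {
>     return isRelationRuntimeHash && nattrs > 0 &&
>            (!allocatedResource || nsegments == bucketnum);
> }
> {code}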


