You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@asterixdb.apache.org by "Taewoo Kim (JIRA)" <ji...@apache.org> on 2017/12/29 18:56:00 UTC
[jira] [Updated] (ASTERIXDB-2215) Filter is not properly applied for a secondary inverted index search

     [ https://issues.apache.org/jira/browse/ASTERIXDB-2215?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Taewoo Kim updated ASTERIXDB-2215:
----------------------------------
    Description: 
Based on the way of writing predicate conditions on a field with filter, the generated plan does not correctly show min and max value of a filter.

{code}
drop dataverse twitter if exists;
create dataverse twitter if not exists;
use dataverse twitter;

create type typeUser if not exists as open {
    id: int64,
    name: string,
    screen_name : string,
    profile_image_url : string,
    lang : string,
    location: string,
    create_at: date,
    description: string,
    followers_count: int32,
    friends_count: int32,
    statues_count: int64
};

create type typePlace if not exists as open{
    country : string,
    country_code : string,
    full_name : string,
    id : string,
    name : string,
    place_type : string,
    bounding_box : rectangle
};

create type typeGeoTag if not exists as open {
    stateID: int32,
    stateName: string,
    countyID: int32,
    countyName: string,
    cityID: int32?,
    cityName: string?
};

create type typeTweet if not exists as open {
    create_at : datetime,
    id: int64,
    "text": string,
    in_reply_to_status : int64,
    in_reply_to_user : int64,
    favorite_count : int64,
    coordinate: point?,
    retweet_count : int64,
    lang : string,
    is_retweet: boolean,
    hashtags : {{ string }} ?,
    user_mentions : {{ int64 }} ? ,
    user : typeUser,
    place : typePlace?,
    geo_tag: typeGeoTag
};

create dataset ds_tweet(typeTweet) if not exists primary key id with filter on create_at;
{code}

For the following query, the logical plan shows empty min[] and two variables in max[] when doing an inverted-index search. 

{code}
USE twitter;
SELECT spatial_cell(get_points(place.bounding_box)[0], create_point(0.0,0.0),1.0,1.0) AS cell, count(*) AS cnt FROM ds_tweet
WHERE ftcontains(text, ['trump'], {'mode':'any'}) AND place.bounding_box IS NOT unknown 
AND datetime('2017-02-25T00:00:00') <= create_at AND  create_at < datetime('2017-02-26T00:00:00')
GROUP BY cell;
{code}

Exact predicates on the filter
{code}
datetime('2017-02-25T00:00:00') <= create_at AND  create_at < datetime('2017-02-26T00:00:00')
{code}

{code}
unnest-map [$$64, $$69, $$70] <- index-search("text_idx", 2, "twitter", "ds_tweet", FALSE, FALSE, 5, null, 21, TRUE, 1, $$63) with filter on min:[] max:[$$67, $$68]
                                        -- SINGLE_PARTITION_INVERTED_INDEX_SEARCH  |PARTITIONED|
                                          exchange
                                          -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                            assign [$$67, $$68, $$63] <- [datetime: { 2017-02-26T00:00:00.000Z }, datetime: { 2017-02-25T00:00:00.000Z }, array: [ "trump" ]]
                                            -- ASSIGN  |PARTITIONED|
                                              empty-tuple-source
                                              -- EMPTY_TUPLE_SOURCE  |PARTITIONED|
{code}


However, for the following query(just switched the location of datetime and create_at at the end of the predicates), it shows another incorrect plan.

{code}
SELECT spatial_cell(get_points(place.bounding_box)[0], create_point(0.0,0.0),1.0,1.0) AS cell, count(*) AS cnt FROM ds_tweet
WHERE ftcontains(text, ['trump'], {'mode':'any'}) AND place.bounding_box IS NOT unknown 
AND datetime('2017-02-25T00:00:00') <= create_at AND  datetime('2017-02-26T00:00:00') > create_at
GROUP BY cell;
{code}

Exact predicates on the filter:
{code}
datetime('2017-02-25T00:00:00') <= create_at AND  datetime('2017-02-26T00:00:00') > create_at
{code}

{code}
unnest-map [$$64, $$69, $$70] <- index-search("text_idx", 2, "twitter", "ds_tweet", FALSE, FALSE, 5, null, 21, TRUE, 1, $$63) with filter on min:[$$67] max:[$$68]
                                        -- SINGLE_PARTITION_INVERTED_INDEX_SEARCH  |PARTITIONED|
                                          exchange
                                          -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                            assign [$$67, $$68, $$63] <- [datetime: { 2017-02-26T00:00:00.000Z }, datetime: { 2017-02-25T00:00:00.000Z }, array: [ "trump" ]]
                                            -- ASSIGN  |PARTITIONED|
                                              empty-tuple-source
                                              -- EMPTY_TUPLE_SOURCE  |PARTITIONED|
{code}

  was:
Based on the way of writing predicate conditions on a field with filter, the generated plan sometimes is correct and sometimes not.

{code}
drop dataverse twitter if exists;
create dataverse twitter if not exists;
use dataverse twitter;

create type typeUser if not exists as open {
    id: int64,
    name: string,
    screen_name : string,
    profile_image_url : string,
    lang : string,
    location: string,
    create_at: date,
    description: string,
    followers_count: int32,
    friends_count: int32,
    statues_count: int64
};

create type typePlace if not exists as open{
    country : string,
    country_code : string,
    full_name : string,
    id : string,
    name : string,
    place_type : string,
    bounding_box : rectangle
};

create type typeGeoTag if not exists as open {
    stateID: int32,
    stateName: string,
    countyID: int32,
    countyName: string,
    cityID: int32?,
    cityName: string?
};

create type typeTweet if not exists as open {
    create_at : datetime,
    id: int64,
    "text": string,
    in_reply_to_status : int64,
    in_reply_to_user : int64,
    favorite_count : int64,
    coordinate: point?,
    retweet_count : int64,
    lang : string,
    is_retweet: boolean,
    hashtags : {{ string }} ?,
    user_mentions : {{ int64 }} ? ,
    user : typeUser,
    place : typePlace?,
    geo_tag: typeGeoTag
};

create dataset ds_tweet(typeTweet) if not exists primary key id with filter on create_at;
{code}

For the following query, the logical plan shows empty min[] and two variables in max[] when doing an inverted-index search. 

{code}
USE twitter;
SELECT spatial_cell(get_points(place.bounding_box)[0], create_point(0.0,0.0),1.0,1.0) AS cell, count(*) AS cnt FROM ds_tweet
WHERE ftcontains(text, ['trump'], {'mode':'any'}) AND place.bounding_box IS NOT unknown 
AND datetime('2017-02-25T00:00:00') <= create_at AND  create_at < datetime('2017-02-26T00:00:00')
GROUP BY cell;
{code}

Exact predicates on the filter
{code}
datetime('2017-02-25T00:00:00') <= create_at AND  create_at < datetime('2017-02-26T00:00:00')
{code}

{code}
unnest-map [$$64, $$69, $$70] <- index-search("text_idx", 2, "twitter", "ds_tweet", FALSE, FALSE, 5, null, 21, TRUE, 1, $$63) with filter on min:[] max:[$$67, $$68]
                                        -- SINGLE_PARTITION_INVERTED_INDEX_SEARCH  |PARTITIONED|
                                          exchange
                                          -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                            assign [$$67, $$68, $$63] <- [datetime: { 2017-02-26T00:00:00.000Z }, datetime: { 2017-02-25T00:00:00.000Z }, array: [ "trump" ]]
                                            -- ASSIGN  |PARTITIONED|
                                              empty-tuple-source
                                              -- EMPTY_TUPLE_SOURCE  |PARTITIONED|
{code}


However, for the following query(just switched the location of datetime and create_at at the end of the predicates), it shows the correct plan.

{code}
SELECT spatial_cell(get_points(place.bounding_box)[0], create_point(0.0,0.0),1.0,1.0) AS cell, count(*) AS cnt FROM ds_tweet
WHERE ftcontains(text, ['trump'], {'mode':'any'}) AND place.bounding_box IS NOT unknown 
AND datetime('2017-02-25T00:00:00') <= create_at AND  datetime('2017-02-26T00:00:00') > create_at
GROUP BY cell;
{code}

Exact predicates on the filter:
{code}
datetime('2017-02-25T00:00:00') <= create_at AND  datetime('2017-02-26T00:00:00') > create_at
{code}

{code}
unnest-map [$$64, $$69, $$70] <- index-search("text_idx", 2, "twitter", "ds_tweet", FALSE, FALSE, 5, null, 21, TRUE, 1, $$63) with filter on min:[$$67] max:[$$68]
                                        -- SINGLE_PARTITION_INVERTED_INDEX_SEARCH  |PARTITIONED|
                                          exchange
                                          -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                            assign [$$67, $$68, $$63] <- [datetime: { 2017-02-26T00:00:00.000Z }, datetime: { 2017-02-25T00:00:00.000Z }, array: [ "trump" ]]
                                            -- ASSIGN  |PARTITIONED|
                                              empty-tuple-source
                                              -- EMPTY_TUPLE_SOURCE  |PARTITIONED|
{code}


> Filter is not properly applied for a secondary inverted index search
> --------------------------------------------------------------------
>
>                 Key: ASTERIXDB-2215
>                 URL: https://issues.apache.org/jira/browse/ASTERIXDB-2215
>             Project: Apache AsterixDB
>          Issue Type: Bug
>            Reporter: Taewoo Kim
>
> Based on the way of writing predicate conditions on a field with filter, the generated plan does not correctly show min and max value of a filter.
> {code}
> drop dataverse twitter if exists;
> create dataverse twitter if not exists;
> use dataverse twitter;
> create type typeUser if not exists as open {
>     id: int64,
>     name: string,
>     screen_name : string,
>     profile_image_url : string,
>     lang : string,
>     location: string,
>     create_at: date,
>     description: string,
>     followers_count: int32,
>     friends_count: int32,
>     statues_count: int64
> };
> create type typePlace if not exists as open{
>     country : string,
>     country_code : string,
>     full_name : string,
>     id : string,
>     name : string,
>     place_type : string,
>     bounding_box : rectangle
> };
> create type typeGeoTag if not exists as open {
>     stateID: int32,
>     stateName: string,
>     countyID: int32,
>     countyName: string,
>     cityID: int32?,
>     cityName: string?
> };
> create type typeTweet if not exists as open {
>     create_at : datetime,
>     id: int64,
>     "text": string,
>     in_reply_to_status : int64,
>     in_reply_to_user : int64,
>     favorite_count : int64,
>     coordinate: point?,
>     retweet_count : int64,
>     lang : string,
>     is_retweet: boolean,
>     hashtags : {{ string }} ?,
>     user_mentions : {{ int64 }} ? ,
>     user : typeUser,
>     place : typePlace?,
>     geo_tag: typeGeoTag
> };
> create dataset ds_tweet(typeTweet) if not exists primary key id with filter on create_at;
> {code}
> For the following query, the logical plan shows empty min[] and two variables in max[] when doing an inverted-index search. 
> {code}
> USE twitter;
> SELECT spatial_cell(get_points(place.bounding_box)[0], create_point(0.0,0.0),1.0,1.0) AS cell, count(*) AS cnt FROM ds_tweet
> WHERE ftcontains(text, ['trump'], {'mode':'any'}) AND place.bounding_box IS NOT unknown 
> AND datetime('2017-02-25T00:00:00') <= create_at AND  create_at < datetime('2017-02-26T00:00:00')
> GROUP BY cell;
> {code}
> Exact predicates on the filter
> {code}
> datetime('2017-02-25T00:00:00') <= create_at AND  create_at < datetime('2017-02-26T00:00:00')
> {code}
> {code}
> unnest-map [$$64, $$69, $$70] <- index-search("text_idx", 2, "twitter", "ds_tweet", FALSE, FALSE, 5, null, 21, TRUE, 1, $$63) with filter on min:[] max:[$$67, $$68]
>                                         -- SINGLE_PARTITION_INVERTED_INDEX_SEARCH  |PARTITIONED|
>                                           exchange
>                                           -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
>                                             assign [$$67, $$68, $$63] <- [datetime: { 2017-02-26T00:00:00.000Z }, datetime: { 2017-02-25T00:00:00.000Z }, array: [ "trump" ]]
>                                             -- ASSIGN  |PARTITIONED|
>                                               empty-tuple-source
>                                               -- EMPTY_TUPLE_SOURCE  |PARTITIONED|
> {code}
> However, for the following query(just switched the location of datetime and create_at at the end of the predicates), it shows another incorrect plan.
> {code}
> SELECT spatial_cell(get_points(place.bounding_box)[0], create_point(0.0,0.0),1.0,1.0) AS cell, count(*) AS cnt FROM ds_tweet
> WHERE ftcontains(text, ['trump'], {'mode':'any'}) AND place.bounding_box IS NOT unknown 
> AND datetime('2017-02-25T00:00:00') <= create_at AND  datetime('2017-02-26T00:00:00') > create_at
> GROUP BY cell;
> {code}
> Exact predicates on the filter:
> {code}
> datetime('2017-02-25T00:00:00') <= create_at AND  datetime('2017-02-26T00:00:00') > create_at
> {code}
> {code}
> unnest-map [$$64, $$69, $$70] <- index-search("text_idx", 2, "twitter", "ds_tweet", FALSE, FALSE, 5, null, 21, TRUE, 1, $$63) with filter on min:[$$67] max:[$$68]
>                                         -- SINGLE_PARTITION_INVERTED_INDEX_SEARCH  |PARTITIONED|
>                                           exchange
>                                           -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
>                                             assign [$$67, $$68, $$63] <- [datetime: { 2017-02-26T00:00:00.000Z }, datetime: { 2017-02-25T00:00:00.000Z }, array: [ "trump" ]]
>                                             -- ASSIGN  |PARTITIONED|
>                                               empty-tuple-source
>                                               -- EMPTY_TUPLE_SOURCE  |PARTITIONED|
> {code}



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)