You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@asterixdb.apache.org by "Wenhai (JIRA)" <ji...@apache.org> on 2016/06/17 16:42:05 UTC

[jira] [Created] (ASTERIXDB-1487) Fuzzy select-join on inverted index poses inconsistent results.

Wenhai created ASTERIXDB-1487:
---------------------------------

             Summary: Fuzzy select-join on inverted index poses inconsistent results.
                 Key: ASTERIXDB-1487
                 URL: https://issues.apache.org/jira/browse/ASTERIXDB-1487
             Project: Apache AsterixDB
          Issue Type: Bug
          Components: AsterixDB
         Environment: Mac, 4 cores, 8 GB memory. Current master as of 3/17/2016.
            Reporter: Wenhai
            Priority: Critical


As shown below, after switching the order of the two "for" branches of the fuzzy join over a select, the results are inconsistent.
Schema
{noformat}
drop dataverse test if exists;
create dataverse test;
use dataverse test;

create type DBLPNestedType as closed {
  id: int64,
  dblpid: string,
  title: string,
  authors: string,
  misc: string
}

create type DBLPType as closed {
  nested: DBLPNestedType
}

create type CSXNestedType as closed {
  id: int64,
  csxid: string,
  title: string,
  authors: string,
  misc: string
}

create type CSXType as closed {
  nested: CSXNestedType
}

create dataset DBLPtmp(DBLPNestedType) primary key id;
create dataset CSXtmp(CSXNestedType) primary key id;

create dataset DBLP(DBLPType) primary key nested.id;
create dataset CSX(CSXType) primary key nested.id;

use dataverse test;

load dataset DBLPtmp
using localfs
(("path"="asterix_nc1://data/dblp-small/dblp-small-multi-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000")) pre-sorted;

load dataset CSXtmp
using localfs
(("path"="asterix_nc1://data/pub-small/csx-small-multi-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));

insert into dataset DBLP(
        for $x in dataset DBLPtmp
        return {
                "nested": $x
        }
);

insert into dataset CSX(
        for $x in dataset CSXtmp
        return {
                "nested": $x
        }
);

{noformat}
Indexes
{noformat}
create index keyword_index on DBLP(nested.title) type keyword; 
create index keyword_indexdbauhors on DBLP(nested.authors) type keyword;
create index keyword_indexcsxauthors on CSX(nested.authors) type keyword;
{noformat}
The following query
{noformat}
use dataverse test;
set simthresholds '.1'
let $s := count(
for $o in dataset DBLP
for $t in dataset CSX
where contains($o.nested.title, "System") and word-tokens($o.nested.authors) ~= word-tokens($t.nested.authors)
return $o
)
return $s
{noformat}
will return 28, while the query
{noformat}
use dataverse test;
set simthresholds '.1'
let $s := count(
for $t in dataset CSX
for $o in dataset DBLP
where contains($o.nested.title, "System") and word-tokens($o.nested.authors) ~= word-tokens($t.nested.authors)
return $o
)
return $s
{noformat}
will return 3, or raise an error on a large dataset.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)