You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@asterixdb.apache.org by "Wenhai (JIRA)" <ji...@apache.org> on 2016/06/17 16:42:05 UTC
[jira] [Created] (ASTERIXDB-1487) Fuzzy select-join on inverted
index poses inconsistent results.
Wenhai created ASTERIXDB-1487:
---------------------------------
Summary: Fuzzy select-join on inverted index poses inconsistent results.
Key: ASTERIXDB-1487
URL: https://issues.apache.org/jira/browse/ASTERIXDB-1487
Project: Apache AsterixDB
Issue Type: Bug
Components: AsterixDB
Environment: MAC 4 cores, 8GB memory. The current master till 3/17/2016.
Reporter: Wenhai
Priority: Critical
As shown below, after switching the two "for" branches of the fuzzy join over a select, the results are inconsistent.
Schema
{noformat}
/* Recreate the `test` dataverse from scratch so the reproduction starts from a clean state. */
drop dataverse test if exists;
create dataverse test;
use dataverse test;
/* Flat DBLP record layout; this is the item type of the DBLPtmp staging dataset below. */
create type DBLPNestedType as closed {
id: int64,
dblpid: string,
title: string,
authors: string,
misc: string
}
/* Wrapper type: the whole DBLP record is held under a single `nested` field. */
create type DBLPType as closed {
nested: DBLPNestedType
}
/* Flat CSX record layout; same fields as DBLPNestedType except `csxid` takes the place of `dblpid`. */
create type CSXNestedType as closed {
id: int64,
csxid: string,
title: string,
authors: string,
misc: string
}
/* Wrapper type: the whole CSX record is held under a single `nested` field. */
create type CSXType as closed {
nested: CSXNestedType
}
/* Staging datasets store the flat records, keyed on `id`. */
create dataset DBLPtmp(DBLPNestedType) primary key id;
create dataset CSXtmp(CSXNestedType) primary key id;
/* Final datasets store the wrapped records, keyed on the nested field `nested.id`. */
create dataset DBLP(DBLPType) primary key nested.id;
create dataset CSX(CSXType) primary key nested.id;
/* Populate the staging datasets from local files, then copy every staged record into the final datasets. */
use dataverse test;
/* Load the ":"-delimited DBLP text file into DBLPtmp.
   The quote character is set to \u0000 (presumably to switch quoting off; confirm).
   `pre-sorted` is given for this load only (presumably the file is already ordered by primary key; confirm). */
load dataset DBLPtmp
using localfs
(("path"="asterix_nc1://data/dblp-small/dblp-small-multi-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000")) pre-sorted;
/* Load the ":"-delimited CSX text file into CSXtmp with the same format options, but without `pre-sorted`. */
load dataset CSXtmp
using localfs
(("path"="asterix_nc1://data/pub-small/csx-small-multi-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
/* Wrap each staged DBLP record under the `nested` field and insert it into DBLP. */
insert into dataset DBLP(
for $x in dataset DBLPtmp
return {
"nested": $x
}
);
/* Wrap each staged CSX record under the `nested` field and insert it into CSX. */
insert into dataset CSX(
for $x in dataset CSXtmp
return {
"nested": $x
}
);
{noformat}
Indexes
{noformat}
/* Keyword indexes on the nested fields the queries in this report touch:
   DBLP title (used in `contains`) and the authors field of both datasets (used in the `~=` join).
   NOTE(review): the name `keyword_indexdbauhors` looks like a misspelling of "...dbauthors";
   it is only an identifier and is kept exactly as reported. */
create index keyword_index on DBLP(nested.title) type keyword;
create index keyword_indexdbauhors on DBLP(nested.authors) type keyword;
create index keyword_indexcsxauthors on CSX(nested.authors) type keyword;
{noformat}
The following query
{noformat}
/* Variant 1: DBLP ($o) is the outer `for`, CSX ($t) is the inner `for`.
   Counts the $o values returned for every ($o, $t) pair where the DBLP title contains "System"
   and the word tokens of the two authors fields match under the `~=` operator.
   The report states this variant returns 28. */
use dataverse test;
/* Threshold setting for the fuzzy match, given as '.1'.
   NOTE(review): AsterixDB's documentation spells this parameter `simthreshold` (singular) and ends
   `set` statements with `;`. Confirm this line matches what was actually executed. */
set simthresholds '.1'
let $s := count(
for $o in dataset DBLP
for $t in dataset CSX
where contains($o.nested.title, "System") and word-tokens($o.nested.authors) ~= word-tokens($t.nested.authors)
return $o
)
return $s
{noformat}
will return 28, while the query
{noformat}
/* Variant 2: CSX ($t) is the outer `for`, DBLP ($o) is the inner `for`.
   The `where` and `return` clauses are identical to the DBLP-outer variant in this report;
   only the order of the two `for` clauses differs.
   The report states this variant returns 3 (or fails on a larger dataset) instead of 28. */
use dataverse test;
/* Threshold setting for the fuzzy match, given as '.1'.
   NOTE(review): AsterixDB's documentation spells this parameter `simthreshold` (singular) and ends
   `set` statements with `;`. Confirm this line matches what was actually executed. */
set simthresholds '.1'
let $s := count(
for $t in dataset CSX
for $o in dataset DBLP
where contains($o.nested.title, "System") and word-tokens($o.nested.authors) ~= word-tokens($t.nested.authors)
return $o
)
return $s
{noformat}
will return 3, or raise an error on a large dataset.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)