You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@kylin.apache.org by "Liu Zhao (Jira)" <ji...@apache.org> on 2022/12/19 03:52:00 UTC
[jira] [Updated] (KYLIN-5371) Kylin4 在多分区查询bug
[ https://issues.apache.org/jira/browse/KYLIN-5371?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Liu Zhao updated KYLIN-5371:
----------------------------
Attachment: image-2022-12-19-11-49-48-323.png
Description:
在创建model时如果增量构建时指定了两个partition列,date 和 hour,构建没有问题,但在查询时如果where只指定 = 某个date值,查询结果非预期值。
// pdate, phour 都是分区列,在创建model时也指定为partition,详情见附件图片
--q1:
select pdate, phour, count(1) from lz_test_partition where pdate = '2022-12-19' group by pdate, phour
--q2:
select pdate, phour, count(1) from lz_test_partition group by pdate, phour
查看源码,bug 出现在 org.apache.spark.sql.execution.datasource.SegFilters#foldFilter 和 org.apache.spark.sql.execution.datasource.SegFilters#insurance 中,一处只用日期判断一处用到time级判断。
{code:java}
case class SegFilters(start: Long, end: Long, pattern: String) extends Logging {
private def insurance(value: Any)
(func: Long => Filter): Filter = {
value match {
case v: Date =>
// see SPARK-27546
val ts = DateFormat.stringToMillis(v.toString)
func(ts)
case v @ (_:String | _: Int | _: Long) if pattern != null =>
val format = DateFormat.getDateFormat(pattern)
val time = format.parse(v.toString).getTime
func(time)
case v: Timestamp =>
func(v.getTime)
case _ =>
Trivial(true)
}
}
/**
* Recursively fold provided filters to trivial,
* blocks are always non-empty.
*/
def foldFilter(filter: Filter): Filter = {
filter match {
case EqualTo(_, value: Any) =>
insurance(value) {
ts => Trivial(ts >= start && ts < end) --注意在这个地方是有问题的,ts 是date,但start 和 end 可以是到time级,因此在这里的过滤会丢
}
case In(_, values: Array[Any]) =>
val satisfied = values.map(v => insurance(v) {
ts => Trivial(ts >= start && ts < end)
}).exists(_.equals(Trivial(true)))
Trivial(satisfied)
case IsNull(_) =>
Trivial(false)
case IsNotNull(_) =>
Trivial(true)
case GreaterThan(_, value: Any) =>
insurance(value) {
ts => Trivial(ts < end)
}
case GreaterThanOrEqual(_, value: Any) =>
insurance(value) {
ts => Trivial(ts < end)
}
case LessThan(_, value: Any) =>
insurance(value) {
ts => Trivial(ts > start)
}
case LessThanOrEqual(_, value: Any) =>
insurance(value) {
ts => Trivial(ts >= start)
}
case And(left: Filter, right: Filter) =>
And(foldFilter(left), foldFilter(right)) match {
case And(AlwaysFalse, _) => Trivial(false)
case And(_, AlwaysFalse) => Trivial(false)
case And(AlwaysTrue, right) => right
case And(left, AlwaysTrue) => left
case other => other
}
case Or(left: Filter, right: Filter) =>
Or(foldFilter(left), foldFilter(right)) match {
case Or(AlwaysTrue, _) => Trivial(true)
case Or(_, AlwaysTrue) => Trivial(true)
case Or(AlwaysFalse, right) => right
case Or(left, AlwaysFalse) => left
case other => other
}
case unsupportedFilter =>
// return 'true' to scan all partitions
// currently unsupported filters are:
// - StringStartsWith
// - StringEndsWith
// - StringContains
// - EqualNullSafe
Trivial(true)
}
}
def Trivial(value: Boolean): Filter = {
if (value) AlwaysTrue else AlwaysFalse
}
}
{code}
详情及原因看附件图片:
!image-2022-12-19-11-49-48-323.png!
!image-2022-12-19-11-33-36-654.png!
!image-2022-12-19-11-34-06-372.png!
!image-2022-12-19-11-34-45-932.png!
!image-2022-12-19-11-35-03-652.png!
was:
在创建model时如果增量构建时指定了两个partition列,date 和 hour,构建没有问题,但在查询时如果where只指定 = 某个date值,查询结果非预期值。
--q1:
select pdate, phour, count(1) from lz_test_partition where pdate = '2022-12-19' group by pdate, phour
--q2:
select pdate, phour, count(1) from lz_test_partition group by pdate, phour
查看源码,bug 出现在 org.apache.spark.sql.execution.datasource.SegFilters#foldFilter 和 org.apache.spark.sql.execution.datasource.SegFilters#insurance 中,一处只用日期判断一处用到time级判断。
{code:java}
case class SegFilters(start: Long, end: Long, pattern: String) extends Logging {
private def insurance(value: Any)
(func: Long => Filter): Filter = {
value match {
case v: Date =>
// see SPARK-27546
val ts = DateFormat.stringToMillis(v.toString)
func(ts)
case v @ (_:String | _: Int | _: Long) if pattern != null =>
val format = DateFormat.getDateFormat(pattern)
val time = format.parse(v.toString).getTime
func(time)
case v: Timestamp =>
func(v.getTime)
case _ =>
Trivial(true)
}
}
/**
* Recursively fold provided filters to trivial,
* blocks are always non-empty.
*/
def foldFilter(filter: Filter): Filter = {
filter match {
case EqualTo(_, value: Any) =>
insurance(value) {
ts => Trivial(ts >= start && ts < end) --注意在这个地方是有问题的,ts 是date,但start 和 end 可以是到time级,因此在这里的过滤会丢
}
case In(_, values: Array[Any]) =>
val satisfied = values.map(v => insurance(v) {
ts => Trivial(ts >= start && ts < end)
}).exists(_.equals(Trivial(true)))
Trivial(satisfied)
case IsNull(_) =>
Trivial(false)
case IsNotNull(_) =>
Trivial(true)
case GreaterThan(_, value: Any) =>
insurance(value) {
ts => Trivial(ts < end)
}
case GreaterThanOrEqual(_, value: Any) =>
insurance(value) {
ts => Trivial(ts < end)
}
case LessThan(_, value: Any) =>
insurance(value) {
ts => Trivial(ts > start)
}
case LessThanOrEqual(_, value: Any) =>
insurance(value) {
ts => Trivial(ts >= start)
}
case And(left: Filter, right: Filter) =>
And(foldFilter(left), foldFilter(right)) match {
case And(AlwaysFalse, _) => Trivial(false)
case And(_, AlwaysFalse) => Trivial(false)
case And(AlwaysTrue, right) => right
case And(left, AlwaysTrue) => left
case other => other
}
case Or(left: Filter, right: Filter) =>
Or(foldFilter(left), foldFilter(right)) match {
case Or(AlwaysTrue, _) => Trivial(true)
case Or(_, AlwaysTrue) => Trivial(true)
case Or(AlwaysFalse, right) => right
case Or(left, AlwaysFalse) => left
case other => other
}
case unsupportedFilter =>
// return 'true' to scan all partitions
// currently unsupported filters are:
// - StringStartsWith
// - StringEndsWith
// - StringContains
// - EqualNullSafe
Trivial(true)
}
}
def Trivial(value: Boolean): Filter = {
if (value) AlwaysTrue else AlwaysFalse
}
}
{code}
详情及原因看附件图片:
!image-2022-12-19-11-33-36-654.png!
!image-2022-12-19-11-34-06-372.png!
!image-2022-12-19-11-34-45-932.png!
!image-2022-12-19-11-35-03-652.png!
> Kylin4 在多分区查询bug
> ----------------
>
> Key: KYLIN-5371
> URL: https://issues.apache.org/jira/browse/KYLIN-5371
> Project: Kylin
> Issue Type: Bug
> Affects Versions: v4.0.1, v4.0.2
> Reporter: Liu Zhao
> Priority: Major
> Attachments: image-2022-12-19-11-33-36-654.png, image-2022-12-19-11-34-06-372.png, image-2022-12-19-11-34-45-932.png, image-2022-12-19-11-35-03-652.png, image-2022-12-19-11-49-48-323.png
>
>
> 在创建model时如果增量构建时指定了两个partition列,date 和 hour,构建没有问题,但在查询时如果where只指定 = 某个date值,查询结果非预期值。
> // pdate, phour 都是分区列,在创建model时也指定为partition,详情见附件图片
> --q1:
> select pdate, phour, count(1) from lz_test_partition where pdate = '2022-12-19' group by pdate, phour
> --q2:
> select pdate, phour, count(1) from lz_test_partition group by pdate, phour
> 查看源码,bug 出现在 org.apache.spark.sql.execution.datasource.SegFilters#foldFilter 和 org.apache.spark.sql.execution.datasource.SegFilters#insurance 中,一处只用日期判断一处用到time级判断。
> {code:java}
> case class SegFilters(start: Long, end: Long, pattern: String) extends Logging {
> private def insurance(value: Any)
> (func: Long => Filter): Filter = {
> value match {
> case v: Date =>
> // see SPARK-27546
> val ts = DateFormat.stringToMillis(v.toString)
> func(ts)
> case v @ (_:String | _: Int | _: Long) if pattern != null =>
> val format = DateFormat.getDateFormat(pattern)
> val time = format.parse(v.toString).getTime
> func(time)
> case v: Timestamp =>
> func(v.getTime)
> case _ =>
> Trivial(true)
> }
> }
> /**
> * Recursively fold provided filters to trivial,
> * blocks are always non-empty.
> */
> def foldFilter(filter: Filter): Filter = {
> filter match {
> case EqualTo(_, value: Any) =>
> insurance(value) {
> ts => Trivial(ts >= start && ts < end) --注意在这个地方是有问题的,ts 是date,但start 和 end 可以是到time级,因此在这里的过滤会丢
> }
> case In(_, values: Array[Any]) =>
> val satisfied = values.map(v => insurance(v) {
> ts => Trivial(ts >= start && ts < end)
> }).exists(_.equals(Trivial(true)))
> Trivial(satisfied)
> case IsNull(_) =>
> Trivial(false)
> case IsNotNull(_) =>
> Trivial(true)
> case GreaterThan(_, value: Any) =>
> insurance(value) {
> ts => Trivial(ts < end)
> }
> case GreaterThanOrEqual(_, value: Any) =>
> insurance(value) {
> ts => Trivial(ts < end)
> }
> case LessThan(_, value: Any) =>
> insurance(value) {
> ts => Trivial(ts > start)
> }
> case LessThanOrEqual(_, value: Any) =>
> insurance(value) {
> ts => Trivial(ts >= start)
> }
> case And(left: Filter, right: Filter) =>
> And(foldFilter(left), foldFilter(right)) match {
> case And(AlwaysFalse, _) => Trivial(false)
> case And(_, AlwaysFalse) => Trivial(false)
> case And(AlwaysTrue, right) => right
> case And(left, AlwaysTrue) => left
> case other => other
> }
> case Or(left: Filter, right: Filter) =>
> Or(foldFilter(left), foldFilter(right)) match {
> case Or(AlwaysTrue, _) => Trivial(true)
> case Or(_, AlwaysTrue) => Trivial(true)
> case Or(AlwaysFalse, right) => right
> case Or(left, AlwaysFalse) => left
> case other => other
> }
> case unsupportedFilter =>
> // return 'true' to scan all partitions
> // currently unsupported filters are:
> // - StringStartsWith
> // - StringEndsWith
> // - StringContains
> // - EqualNullSafe
> Trivial(true)
> }
> }
> def Trivial(value: Boolean): Filter = {
> if (value) AlwaysTrue else AlwaysFalse
> }
> }
> {code}
> 详情及原因看附件图片:
> !image-2022-12-19-11-49-48-323.png!
> !image-2022-12-19-11-33-36-654.png!
> !image-2022-12-19-11-34-06-372.png!
> !image-2022-12-19-11-34-45-932.png!
> !image-2022-12-19-11-35-03-652.png!
--
This message was sent by Atlassian Jira
(v8.20.10#820010)