You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@kylin.apache.org by "Liu Zhao (Jira)" <ji...@apache.org> on 2022/12/19 03:52:00 UTC

[jira] [Updated] (KYLIN-5371) Kylin4 在多分区查询bug

     [ https://issues.apache.org/jira/browse/KYLIN-5371?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Liu Zhao updated KYLIN-5371:
----------------------------
     Attachment: image-2022-12-19-11-49-48-323.png
    Description: 
在创建model时如果增量构建时指定了两个partition列,date 和 hour,构建没有问题,但在查询时如果where只指定 = 某个date值,查询结果非预期值。

// pdate, phour 都是分区列,在创建model时也指定为partition,详情见附件图片

--q1:
select pdate, phour, count(1) from lz_test_partition where pdate = '2022-12-19' group by pdate, phour

--q2:
select pdate, phour, count(1) from lz_test_partition group by pdate, phour



查看源码,bug 出现在 org.apache.spark.sql.execution.datasource.SegFilters#foldFilter 和 org.apache.spark.sql.execution.datasource.SegFilters#insurance 中,一处只用日期判断一处用到time级判断。



{code:java}
case class SegFilters(start: Long, end: Long, pattern: String) extends Logging {

  private def insurance(value: Any)
                       (func: Long => Filter): Filter = {
    value match {
      case v: Date =>
        // see SPARK-27546
        val ts = DateFormat.stringToMillis(v.toString)
        func(ts)
      case v @ (_:String | _: Int | _: Long) if pattern != null =>
        val format = DateFormat.getDateFormat(pattern)
        val time = format.parse(v.toString).getTime
        func(time)
      case v: Timestamp =>
        func(v.getTime)
      case _ =>
        Trivial(true)
    }
  }

  /**
   * Recursively fold provided filters to trivial,
   * blocks are always non-empty.
   */
  def foldFilter(filter: Filter): Filter = {
    filter match {
      case EqualTo(_, value: Any) =>
        insurance(value) {
          ts => Trivial(ts >= start && ts < end)    --注意在这个地方是有问题的,ts 是date,但start 和 end 可以是到time级,因此在这里的过滤会丢
        }
      case In(_, values: Array[Any]) =>
        val satisfied = values.map(v => insurance(v) {
          ts => Trivial(ts >= start && ts < end)
        }).exists(_.equals(Trivial(true)))
        Trivial(satisfied)

      case IsNull(_) =>
        Trivial(false)
      case IsNotNull(_) =>
        Trivial(true)
      case GreaterThan(_, value: Any) =>
        insurance(value) {
          ts => Trivial(ts < end)
        }
      case GreaterThanOrEqual(_, value: Any) =>
        insurance(value) {
          ts => Trivial(ts < end)
        }
      case LessThan(_, value: Any) =>
        insurance(value) {
          ts => Trivial(ts > start)
        }
      case LessThanOrEqual(_, value: Any) =>
        insurance(value) {
          ts => Trivial(ts >= start)
        }
      case And(left: Filter, right: Filter) =>
        And(foldFilter(left), foldFilter(right)) match {
          case And(AlwaysFalse, _) => Trivial(false)
          case And(_, AlwaysFalse) => Trivial(false)
          case And(AlwaysTrue, right) => right
          case And(left, AlwaysTrue) => left
          case other => other
        }
      case Or(left: Filter, right: Filter) =>
        Or(foldFilter(left), foldFilter(right)) match {
          case Or(AlwaysTrue, _) => Trivial(true)
          case Or(_, AlwaysTrue) => Trivial(true)
          case Or(AlwaysFalse, right) => right
          case Or(left, AlwaysFalse) => left
          case other => other
        }
      case unsupportedFilter =>
        // return 'true' to scan all partitions
        // currently unsupported filters are:
        // - StringStartsWith
        // - StringEndsWith
        // - StringContains
        // - EqualNullSafe
        Trivial(true)
    }
  }
  def Trivial(value: Boolean): Filter = {
    if (value) AlwaysTrue else AlwaysFalse
  }
}
{code}


详情及原因看附件图片:
 !image-2022-12-19-11-49-48-323.png! 
 !image-2022-12-19-11-33-36-654.png! 
 !image-2022-12-19-11-34-06-372.png! 
 !image-2022-12-19-11-34-45-932.png! 
 !image-2022-12-19-11-35-03-652.png! 

  was:
在创建model时如果增量构建时指定了两个partition列,date 和 hour,构建没有问题,但在查询时如果where只指定 = 某个date值,查询结果非预期值。

--q1:
select pdate, phour, count(1) from lz_test_partition where pdate = '2022-12-19' group by pdate, phour

--q2:
select pdate, phour, count(1) from lz_test_partition group by pdate, phour



查看源码,bug 出现在 org.apache.spark.sql.execution.datasource.SegFilters#foldFilter 和 org.apache.spark.sql.execution.datasource.SegFilters#insurance 中,一处只用日期判断一处用到time级判断。



{code:java}
case class SegFilters(start: Long, end: Long, pattern: String) extends Logging {

  private def insurance(value: Any)
                       (func: Long => Filter): Filter = {
    value match {
      case v: Date =>
        // see SPARK-27546
        val ts = DateFormat.stringToMillis(v.toString)
        func(ts)
      case v @ (_:String | _: Int | _: Long) if pattern != null =>
        val format = DateFormat.getDateFormat(pattern)
        val time = format.parse(v.toString).getTime
        func(time)
      case v: Timestamp =>
        func(v.getTime)
      case _ =>
        Trivial(true)
    }
  }

  /**
   * Recursively fold provided filters to trivial,
   * blocks are always non-empty.
   */
  def foldFilter(filter: Filter): Filter = {
    filter match {
      case EqualTo(_, value: Any) =>
        insurance(value) {
          ts => Trivial(ts >= start && ts < end)    --注意在这个地方是有问题的,ts 是date,但start 和 end 可以是到time级,因此在这里的过滤会丢
        }
      case In(_, values: Array[Any]) =>
        val satisfied = values.map(v => insurance(v) {
          ts => Trivial(ts >= start && ts < end)
        }).exists(_.equals(Trivial(true)))
        Trivial(satisfied)

      case IsNull(_) =>
        Trivial(false)
      case IsNotNull(_) =>
        Trivial(true)
      case GreaterThan(_, value: Any) =>
        insurance(value) {
          ts => Trivial(ts < end)
        }
      case GreaterThanOrEqual(_, value: Any) =>
        insurance(value) {
          ts => Trivial(ts < end)
        }
      case LessThan(_, value: Any) =>
        insurance(value) {
          ts => Trivial(ts > start)
        }
      case LessThanOrEqual(_, value: Any) =>
        insurance(value) {
          ts => Trivial(ts >= start)
        }
      case And(left: Filter, right: Filter) =>
        And(foldFilter(left), foldFilter(right)) match {
          case And(AlwaysFalse, _) => Trivial(false)
          case And(_, AlwaysFalse) => Trivial(false)
          case And(AlwaysTrue, right) => right
          case And(left, AlwaysTrue) => left
          case other => other
        }
      case Or(left: Filter, right: Filter) =>
        Or(foldFilter(left), foldFilter(right)) match {
          case Or(AlwaysTrue, _) => Trivial(true)
          case Or(_, AlwaysTrue) => Trivial(true)
          case Or(AlwaysFalse, right) => right
          case Or(left, AlwaysFalse) => left
          case other => other
        }
      case unsupportedFilter =>
        // return 'true' to scan all partitions
        // currently unsupported filters are:
        // - StringStartsWith
        // - StringEndsWith
        // - StringContains
        // - EqualNullSafe
        Trivial(true)
    }
  }
  def Trivial(value: Boolean): Filter = {
    if (value) AlwaysTrue else AlwaysFalse
  }
}
{code}


详情及原因看附件图片:
 !image-2022-12-19-11-33-36-654.png! 
 !image-2022-12-19-11-34-06-372.png! 
 !image-2022-12-19-11-34-45-932.png! 
 !image-2022-12-19-11-35-03-652.png! 


> Kylin4 在多分区查询bug
> ----------------
>
>                 Key: KYLIN-5371
>                 URL: https://issues.apache.org/jira/browse/KYLIN-5371
>             Project: Kylin
>          Issue Type: Bug
>    Affects Versions: v4.0.1, v4.0.2
>            Reporter: Liu Zhao
>            Priority: Major
>         Attachments: image-2022-12-19-11-33-36-654.png, image-2022-12-19-11-34-06-372.png, image-2022-12-19-11-34-45-932.png, image-2022-12-19-11-35-03-652.png, image-2022-12-19-11-49-48-323.png
>
>
> 在创建model时如果增量构建时指定了两个partition列,date 和 hour,构建没有问题,但在查询时如果where只指定 = 某个date值,查询结果非预期值。
> // pdate, phour 都是分区列,在创建model时也指定为partition,详情见附件图片
> --q1:
> select pdate, phour, count(1) from lz_test_partition where pdate = '2022-12-19' group by pdate, phour
> --q2:
> select pdate, phour, count(1) from lz_test_partition group by pdate, phour
> 查看源码,bug 出现在 org.apache.spark.sql.execution.datasource.SegFilters#foldFilter 和 org.apache.spark.sql.execution.datasource.SegFilters#insurance 中,一处只用日期判断一处用到time级判断。
> {code:java}
> case class SegFilters(start: Long, end: Long, pattern: String) extends Logging {
>   private def insurance(value: Any)
>                        (func: Long => Filter): Filter = {
>     value match {
>       case v: Date =>
>         // see SPARK-27546
>         val ts = DateFormat.stringToMillis(v.toString)
>         func(ts)
>       case v @ (_:String | _: Int | _: Long) if pattern != null =>
>         val format = DateFormat.getDateFormat(pattern)
>         val time = format.parse(v.toString).getTime
>         func(time)
>       case v: Timestamp =>
>         func(v.getTime)
>       case _ =>
>         Trivial(true)
>     }
>   }
>   /**
>    * Recursively fold provided filters to trivial,
>    * blocks are always non-empty.
>    */
>   def foldFilter(filter: Filter): Filter = {
>     filter match {
>       case EqualTo(_, value: Any) =>
>         insurance(value) {
>           ts => Trivial(ts >= start && ts < end)    --注意在这个地方是有问题的,ts 是date,但start 和 end 可以是到time级,因此在这里的过滤会丢
>         }
>       case In(_, values: Array[Any]) =>
>         val satisfied = values.map(v => insurance(v) {
>           ts => Trivial(ts >= start && ts < end)
>         }).exists(_.equals(Trivial(true)))
>         Trivial(satisfied)
>       case IsNull(_) =>
>         Trivial(false)
>       case IsNotNull(_) =>
>         Trivial(true)
>       case GreaterThan(_, value: Any) =>
>         insurance(value) {
>           ts => Trivial(ts < end)
>         }
>       case GreaterThanOrEqual(_, value: Any) =>
>         insurance(value) {
>           ts => Trivial(ts < end)
>         }
>       case LessThan(_, value: Any) =>
>         insurance(value) {
>           ts => Trivial(ts > start)
>         }
>       case LessThanOrEqual(_, value: Any) =>
>         insurance(value) {
>           ts => Trivial(ts >= start)
>         }
>       case And(left: Filter, right: Filter) =>
>         And(foldFilter(left), foldFilter(right)) match {
>           case And(AlwaysFalse, _) => Trivial(false)
>           case And(_, AlwaysFalse) => Trivial(false)
>           case And(AlwaysTrue, right) => right
>           case And(left, AlwaysTrue) => left
>           case other => other
>         }
>       case Or(left: Filter, right: Filter) =>
>         Or(foldFilter(left), foldFilter(right)) match {
>           case Or(AlwaysTrue, _) => Trivial(true)
>           case Or(_, AlwaysTrue) => Trivial(true)
>           case Or(AlwaysFalse, right) => right
>           case Or(left, AlwaysFalse) => left
>           case other => other
>         }
>       case unsupportedFilter =>
>         // return 'true' to scan all partitions
>         // currently unsupported filters are:
>         // - StringStartsWith
>         // - StringEndsWith
>         // - StringContains
>         // - EqualNullSafe
>         Trivial(true)
>     }
>   }
>   def Trivial(value: Boolean): Filter = {
>     if (value) AlwaysTrue else AlwaysFalse
>   }
> }
> {code}
> 详情及原因看附件图片:
>  !image-2022-12-19-11-49-48-323.png! 
>  !image-2022-12-19-11-33-36-654.png! 
>  !image-2022-12-19-11-34-06-372.png! 
>  !image-2022-12-19-11-34-45-932.png! 
>  !image-2022-12-19-11-35-03-652.png! 



--
This message was sent by Atlassian Jira
(v8.20.10#820010)