You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Yuming Wang (Jira)" <ji...@apache.org> on 2022/08/04 10:26:00 UTC

[jira] [Created] (SPARK-39978) Make filtered distinct count more accurate

Yuming Wang created SPARK-39978:
-----------------------------------

             Summary: Make filtered distinct count more accurate
                 Key: SPARK-39978
                 URL: https://issues.apache.org/jira/browse/SPARK-39978
             Project: Spark
          Issue Type: Improvement
          Components: SQL
    Affects Versions: 3.4.0
            Reporter: Yuming Wang


How to reproduce this issue:
{code:scala}
// Enable cost-based optimization so filter selectivity estimation kicks in.
spark.sql("set spark.sql.cbo.enabled=true")
// Table layout mirrors the TPC-DS date_dim dimension table.
spark.sql("CREATE TABLE date_dim(d_month_seq int, d_year int, d_qoy int, d_moy int) USING parquet LOCATION 'file:/Users/yumwang/data/date_dim'")
// Collect column-level statistics (min/max/distinct count) used by the CBO.
spark.sql("analyze table date_dim compute statistics for all columns")
// Range filter on d_month_seq; the estimated distinct counts in the plan are the issue.
spark.sql("select * from date_dim where d_month_seq >= 1212 and d_month_seq <= 1223").explain("cost")
{code}

Output attribute stats:
{code:scala}
case class Statistics(
    sizeInBytes: BigInt,
    rowCount: Option[BigInt] = None,
    attributeStats: AttributeMap[ColumnStat] = AttributeMap(Nil),
    isRuntime: Boolean = false) {

  override def toString: String = s"Statistics($simpleString)"

  /** Readable string representation for the Statistics. */
  def simpleString: String = {
    val sizePart = s"sizeInBytes=${Utils.bytesToString(sizeInBytes)}"
    val rowPart = rowCount match {
      case Some(count) =>
        // Render the row count with 3 significant digits (scientific notation),
        // and append the per-attribute column statistics.
        val rounded = BigDecimal(count, new MathContext(3, RoundingMode.HALF_UP))
        s"rowCount=${rounded.toString()}, attributeStats=${attributeStats}"
      case None =>
        ""
    }
    // Drop the empty row-count segment when rowCount is undefined.
    Seq(sizePart, rowPart).filter(_.nonEmpty).mkString(", ")
  }
}
{code}






--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org