You are viewing a plain text version of this content. The canonical link for it is here.
Posted to reviews@spark.apache.org by kiszk <gi...@git.apache.org> on 2017/05/16 15:58:42 UTC

[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

GitHub user kiszk opened a pull request:

    https://github.com/apache/spark/pull/18002

    [SPARK-20770][SQL] Improve ColumnStats

    ## What changes were proposed in this pull request?
    
    This PR improves the implementation of `ColumnStats` by using the following appoaches.
    
    1. Declare subclasses of `ColumnStats` as `final` 
    2. Remove unnecessary call of `row.isNullAt(ordinal)` 
    3. Remove the dependency on `GenericInternalRow`
    
    For 1., this declaration encourages method inlining and other optimizations of JIT compiler
    For 2., in `gatherStats()`, while previous code in subclasses of `ColumnStats` always calls `row.isNullAt()` twice, the PR just calls `row.isNullAt()` only once.
    For 3., `collectedStatistics()` returns `Array[Any]` instead of `GenericInternalRow`. This removes the dependency of unnecessary package and reduces the number of allocations of `GenericInternalRow`.
    
    In addition to that, in the future, `gatherValueStats()`, which is specialized for each data type, can be effectively called from the generated code without using generic data structure `InternalRow`.
    
    ## How was this patch tested?
    
    Tested by existing test suite

You can merge this pull request into a Git repository by running:

    $ git pull https://github.com/kiszk/spark SPARK-20770

Alternatively you can review and apply these changes as the patch at:

    https://github.com/apache/spark/pull/18002.patch

To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:

    This closes #18002
    
----

----


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by cloud-fan <gi...@git.apache.org>.
Github user cloud-fan commented on a diff in the pull request:

    https://github.com/apache/spark/pull/18002#discussion_r117153431
  
    --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala ---
    @@ -53,219 +53,299 @@ private[columnar] sealed trait ColumnStats extends Serializable {
       /**
        * Gathers statistics information from `row(ordinal)`.
        */
    -  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    if (row.isNullAt(ordinal)) {
    -      nullCount += 1
    -      // 4 bytes for null position
    -      sizeInBytes += 4
    -    }
    +  def gatherStats(row: InternalRow, ordinal: Int): Unit
    +
    +  /**
    +   * Gathers statistics information on `null`.
    +   */
    +  def gatherNullStats(): Unit = {
    +    nullCount += 1
    +    // 4 bytes for null position
    +    sizeInBytes += 4
         count += 1
       }
     
       /**
    -   * Column statistics represented as a single row, currently including closed lower bound, closed
    +   * Column statistics represented as an array, currently including closed lower bound, closed
        * upper bound and null count.
        */
    -  def collectedStatistics: GenericInternalRow
    +  def collectedStatistics: Array[Any]
     }
     
     /**
      * A no-op ColumnStats only used for testing purposes.
      */
    -private[columnar] class NoopColumnStats extends ColumnStats {
    -  override def gatherStats(row: InternalRow, ordinal: Int): Unit = super.gatherStats(row, ordinal)
    +private[columnar] final class NoopColumnStats extends ColumnStats {
    +  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    +    if (!row.isNullAt(ordinal)) {
    +      count += 1
    +    } else {
    +      gatherNullStats
    +    }
    +  }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](null, null, nullCount, count, 0L))
    +  override def collectedStatistics: Array[Any] = Array[Any](null, null, nullCount, count, 0L)
     }
     
    -private[columnar] class BooleanColumnStats extends ColumnStats {
    +private[columnar] final class BooleanColumnStats extends ColumnStats {
       protected var upper = false
       protected var lower = true
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getBoolean(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += BOOLEAN.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Boolean): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += BOOLEAN.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class ByteColumnStats extends ColumnStats {
    +private[columnar] final class ByteColumnStats extends ColumnStats {
       protected var upper = Byte.MinValue
       protected var lower = Byte.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getByte(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += BYTE.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Byte): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += BYTE.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class ShortColumnStats extends ColumnStats {
    +private[columnar] final class ShortColumnStats extends ColumnStats {
       protected var upper = Short.MinValue
       protected var lower = Short.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getShort(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += SHORT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Short): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += SHORT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class IntColumnStats extends ColumnStats {
    +private[columnar] final class IntColumnStats extends ColumnStats {
       protected var upper = Int.MinValue
       protected var lower = Int.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getInt(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += INT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Int): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += INT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class LongColumnStats extends ColumnStats {
    +private[columnar] final class LongColumnStats extends ColumnStats {
       protected var upper = Long.MinValue
       protected var lower = Long.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getLong(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += LONG.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Long): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += LONG.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class FloatColumnStats extends ColumnStats {
    +private[columnar] final class FloatColumnStats extends ColumnStats {
       protected var upper = Float.MinValue
       protected var lower = Float.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getFloat(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += FLOAT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Float): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += FLOAT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class DoubleColumnStats extends ColumnStats {
    +private[columnar] final class DoubleColumnStats extends ColumnStats {
       protected var upper = Double.MinValue
       protected var lower = Double.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getDouble(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += DOUBLE.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Double): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += DOUBLE.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class StringColumnStats extends ColumnStats {
    +private[columnar] final class StringColumnStats extends ColumnStats {
       protected var upper: UTF8String = null
       protected var lower: UTF8String = null
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getUTF8String(ordinal)
    -      if (upper == null || value.compareTo(upper) > 0) upper = value.clone()
    -      if (lower == null || value.compareTo(lower) < 0) lower = value.clone()
    -      sizeInBytes += STRING.actualSize(row, ordinal)
    +      val size = STRING.actualSize(row, ordinal)
    --- End diff --
    
    not related, but `STRING.actualSize` should just take `UTF8String`


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by cloud-fan <gi...@git.apache.org>.
Github user cloud-fan commented on a diff in the pull request:

    https://github.com/apache/spark/pull/18002#discussion_r117153480
  
    --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala ---
    @@ -53,219 +53,299 @@ private[columnar] sealed trait ColumnStats extends Serializable {
       /**
        * Gathers statistics information from `row(ordinal)`.
        */
    -  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    if (row.isNullAt(ordinal)) {
    -      nullCount += 1
    -      // 4 bytes for null position
    -      sizeInBytes += 4
    -    }
    +  def gatherStats(row: InternalRow, ordinal: Int): Unit
    +
    +  /**
    +   * Gathers statistics information on `null`.
    +   */
    +  def gatherNullStats(): Unit = {
    +    nullCount += 1
    +    // 4 bytes for null position
    +    sizeInBytes += 4
         count += 1
       }
     
       /**
    -   * Column statistics represented as a single row, currently including closed lower bound, closed
    +   * Column statistics represented as an array, currently including closed lower bound, closed
        * upper bound and null count.
        */
    -  def collectedStatistics: GenericInternalRow
    +  def collectedStatistics: Array[Any]
     }
     
     /**
      * A no-op ColumnStats only used for testing purposes.
      */
    -private[columnar] class NoopColumnStats extends ColumnStats {
    -  override def gatherStats(row: InternalRow, ordinal: Int): Unit = super.gatherStats(row, ordinal)
    +private[columnar] final class NoopColumnStats extends ColumnStats {
    +  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    +    if (!row.isNullAt(ordinal)) {
    +      count += 1
    +    } else {
    +      gatherNullStats
    +    }
    +  }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](null, null, nullCount, count, 0L))
    +  override def collectedStatistics: Array[Any] = Array[Any](null, null, nullCount, count, 0L)
     }
     
    -private[columnar] class BooleanColumnStats extends ColumnStats {
    +private[columnar] final class BooleanColumnStats extends ColumnStats {
       protected var upper = false
       protected var lower = true
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getBoolean(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += BOOLEAN.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Boolean): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += BOOLEAN.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class ByteColumnStats extends ColumnStats {
    +private[columnar] final class ByteColumnStats extends ColumnStats {
       protected var upper = Byte.MinValue
       protected var lower = Byte.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getByte(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += BYTE.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Byte): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += BYTE.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class ShortColumnStats extends ColumnStats {
    +private[columnar] final class ShortColumnStats extends ColumnStats {
       protected var upper = Short.MinValue
       protected var lower = Short.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getShort(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += SHORT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Short): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += SHORT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class IntColumnStats extends ColumnStats {
    +private[columnar] final class IntColumnStats extends ColumnStats {
       protected var upper = Int.MinValue
       protected var lower = Int.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getInt(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += INT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Int): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += INT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class LongColumnStats extends ColumnStats {
    +private[columnar] final class LongColumnStats extends ColumnStats {
       protected var upper = Long.MinValue
       protected var lower = Long.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getLong(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += LONG.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Long): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += LONG.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class FloatColumnStats extends ColumnStats {
    +private[columnar] final class FloatColumnStats extends ColumnStats {
       protected var upper = Float.MinValue
       protected var lower = Float.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getFloat(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += FLOAT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Float): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += FLOAT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class DoubleColumnStats extends ColumnStats {
    +private[columnar] final class DoubleColumnStats extends ColumnStats {
       protected var upper = Double.MinValue
       protected var lower = Double.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getDouble(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += DOUBLE.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Double): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += DOUBLE.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class StringColumnStats extends ColumnStats {
    +private[columnar] final class StringColumnStats extends ColumnStats {
       protected var upper: UTF8String = null
       protected var lower: UTF8String = null
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getUTF8String(ordinal)
    -      if (upper == null || value.compareTo(upper) > 0) upper = value.clone()
    -      if (lower == null || value.compareTo(lower) < 0) lower = value.clone()
    -      sizeInBytes += STRING.actualSize(row, ordinal)
    +      val size = STRING.actualSize(row, ordinal)
    +      gatherValueStats(value, size)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: UTF8String, size: Int): Unit = {
    +    if (upper == null || value.compareTo(upper) > 0) upper = value.clone()
    +    if (lower == null || value.compareTo(lower) < 0) lower = value.clone()
    +    sizeInBytes += size
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class BinaryColumnStats extends ColumnStats {
    +private[columnar] final class BinaryColumnStats extends ColumnStats {
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
    -      sizeInBytes += BINARY.actualSize(row, ordinal)
    +      val size = BINARY.actualSize(row, ordinal)
    +      sizeInBytes += size
    +      count += 1
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](null, null, nullCount, count, sizeInBytes))
    +  def gatherValueStats(size: Int): Unit = {
    --- End diff --
    
    is this method used?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark issue #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by AmplabJenkins <gi...@git.apache.org>.
Github user AmplabJenkins commented on the issue:

    https://github.com/apache/spark/pull/18002
  
    Merged build finished. Test PASSed.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark issue #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by SparkQA <gi...@git.apache.org>.
Github user SparkQA commented on the issue:

    https://github.com/apache/spark/pull/18002
  
    **[Test build #77029 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/77029/testReport)** for PR 18002 at commit [`3e3ffde`](https://github.com/apache/spark/commit/3e3ffde9c566e54474fad0c7508544606ddddef3).
     * This patch passes all tests.
     * This patch merges cleanly.
     * This patch adds no public classes.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark issue #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by cloud-fan <gi...@git.apache.org>.
Github user cloud-fan commented on the issue:

    https://github.com/apache/spark/pull/18002
  
    thanks, merging to master!


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by cloud-fan <gi...@git.apache.org>.
Github user cloud-fan commented on a diff in the pull request:

    https://github.com/apache/spark/pull/18002#discussion_r117174531
  
    --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala ---
    @@ -53,219 +53,299 @@ private[columnar] sealed trait ColumnStats extends Serializable {
       /**
        * Gathers statistics information from `row(ordinal)`.
        */
    -  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    if (row.isNullAt(ordinal)) {
    -      nullCount += 1
    -      // 4 bytes for null position
    -      sizeInBytes += 4
    -    }
    +  def gatherStats(row: InternalRow, ordinal: Int): Unit
    +
    +  /**
    +   * Gathers statistics information on `null`.
    +   */
    +  def gatherNullStats(): Unit = {
    +    nullCount += 1
    +    // 4 bytes for null position
    +    sizeInBytes += 4
         count += 1
       }
     
       /**
    -   * Column statistics represented as a single row, currently including closed lower bound, closed
    +   * Column statistics represented as an array, currently including closed lower bound, closed
        * upper bound and null count.
        */
    -  def collectedStatistics: GenericInternalRow
    +  def collectedStatistics: Array[Any]
     }
     
     /**
      * A no-op ColumnStats only used for testing purposes.
      */
    -private[columnar] class NoopColumnStats extends ColumnStats {
    -  override def gatherStats(row: InternalRow, ordinal: Int): Unit = super.gatherStats(row, ordinal)
    +private[columnar] final class NoopColumnStats extends ColumnStats {
    +  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    +    if (!row.isNullAt(ordinal)) {
    +      count += 1
    +    } else {
    +      gatherNullStats
    +    }
    +  }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](null, null, nullCount, count, 0L))
    +  override def collectedStatistics: Array[Any] = Array[Any](null, null, nullCount, count, 0L)
     }
     
    -private[columnar] class BooleanColumnStats extends ColumnStats {
    +private[columnar] final class BooleanColumnStats extends ColumnStats {
       protected var upper = false
       protected var lower = true
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getBoolean(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += BOOLEAN.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Boolean): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += BOOLEAN.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class ByteColumnStats extends ColumnStats {
    +private[columnar] final class ByteColumnStats extends ColumnStats {
       protected var upper = Byte.MinValue
       protected var lower = Byte.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getByte(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += BYTE.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Byte): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += BYTE.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class ShortColumnStats extends ColumnStats {
    +private[columnar] final class ShortColumnStats extends ColumnStats {
       protected var upper = Short.MinValue
       protected var lower = Short.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getShort(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += SHORT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Short): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += SHORT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class IntColumnStats extends ColumnStats {
    +private[columnar] final class IntColumnStats extends ColumnStats {
       protected var upper = Int.MinValue
       protected var lower = Int.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getInt(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += INT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Int): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += INT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class LongColumnStats extends ColumnStats {
    +private[columnar] final class LongColumnStats extends ColumnStats {
       protected var upper = Long.MinValue
       protected var lower = Long.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getLong(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += LONG.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Long): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += LONG.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class FloatColumnStats extends ColumnStats {
    +private[columnar] final class FloatColumnStats extends ColumnStats {
       protected var upper = Float.MinValue
       protected var lower = Float.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getFloat(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += FLOAT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Float): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += FLOAT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class DoubleColumnStats extends ColumnStats {
    +private[columnar] final class DoubleColumnStats extends ColumnStats {
       protected var upper = Double.MinValue
       protected var lower = Double.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getDouble(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += DOUBLE.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Double): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += DOUBLE.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class StringColumnStats extends ColumnStats {
    +private[columnar] final class StringColumnStats extends ColumnStats {
       protected var upper: UTF8String = null
       protected var lower: UTF8String = null
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getUTF8String(ordinal)
    -      if (upper == null || value.compareTo(upper) > 0) upper = value.clone()
    -      if (lower == null || value.compareTo(lower) < 0) lower = value.clone()
    -      sizeInBytes += STRING.actualSize(row, ordinal)
    +      val size = STRING.actualSize(row, ordinal)
    --- End diff --
    
    In `STRING.actualSize`, we call `row.getUTF8String(ordinal)`, so why not we pass in the UTF8String directly?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by cloud-fan <gi...@git.apache.org>.
Github user cloud-fan commented on a diff in the pull request:

    https://github.com/apache/spark/pull/18002#discussion_r117299631
  
    --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala ---
    @@ -53,219 +53,299 @@ private[columnar] sealed trait ColumnStats extends Serializable {
       /**
        * Gathers statistics information from `row(ordinal)`.
        */
    -  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    if (row.isNullAt(ordinal)) {
    -      nullCount += 1
    -      // 4 bytes for null position
    -      sizeInBytes += 4
    -    }
    +  def gatherStats(row: InternalRow, ordinal: Int): Unit
    +
    +  /**
    +   * Gathers statistics information on `null`.
    +   */
    +  def gatherNullStats(): Unit = {
    +    nullCount += 1
    +    // 4 bytes for null position
    +    sizeInBytes += 4
         count += 1
       }
     
       /**
    -   * Column statistics represented as a single row, currently including closed lower bound, closed
    +   * Column statistics represented as an array, currently including closed lower bound, closed
        * upper bound and null count.
        */
    -  def collectedStatistics: GenericInternalRow
    +  def collectedStatistics: Array[Any]
     }
     
     /**
      * A no-op ColumnStats only used for testing purposes.
      */
    -private[columnar] class NoopColumnStats extends ColumnStats {
    -  override def gatherStats(row: InternalRow, ordinal: Int): Unit = super.gatherStats(row, ordinal)
    +private[columnar] final class NoopColumnStats extends ColumnStats {
    +  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    +    if (!row.isNullAt(ordinal)) {
    +      count += 1
    +    } else {
    +      gatherNullStats
    +    }
    +  }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](null, null, nullCount, count, 0L))
    +  override def collectedStatistics: Array[Any] = Array[Any](null, null, nullCount, count, 0L)
     }
     
    -private[columnar] class BooleanColumnStats extends ColumnStats {
    +private[columnar] final class BooleanColumnStats extends ColumnStats {
       protected var upper = false
       protected var lower = true
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getBoolean(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += BOOLEAN.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Boolean): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += BOOLEAN.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class ByteColumnStats extends ColumnStats {
    +private[columnar] final class ByteColumnStats extends ColumnStats {
       protected var upper = Byte.MinValue
       protected var lower = Byte.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getByte(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += BYTE.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Byte): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += BYTE.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class ShortColumnStats extends ColumnStats {
    +private[columnar] final class ShortColumnStats extends ColumnStats {
       protected var upper = Short.MinValue
       protected var lower = Short.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getShort(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += SHORT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Short): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += SHORT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class IntColumnStats extends ColumnStats {
    +private[columnar] final class IntColumnStats extends ColumnStats {
       protected var upper = Int.MinValue
       protected var lower = Int.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getInt(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += INT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Int): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += INT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class LongColumnStats extends ColumnStats {
    +private[columnar] final class LongColumnStats extends ColumnStats {
       protected var upper = Long.MinValue
       protected var lower = Long.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getLong(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += LONG.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Long): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += LONG.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class FloatColumnStats extends ColumnStats {
    +private[columnar] final class FloatColumnStats extends ColumnStats {
       protected var upper = Float.MinValue
       protected var lower = Float.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getFloat(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += FLOAT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Float): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += FLOAT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class DoubleColumnStats extends ColumnStats {
    +private[columnar] final class DoubleColumnStats extends ColumnStats {
       protected var upper = Double.MinValue
       protected var lower = Double.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getDouble(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += DOUBLE.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Double): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += DOUBLE.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class StringColumnStats extends ColumnStats {
    +private[columnar] final class StringColumnStats extends ColumnStats {
       protected var upper: UTF8String = null
       protected var lower: UTF8String = null
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getUTF8String(ordinal)
    -      if (upper == null || value.compareTo(upper) > 0) upper = value.clone()
    -      if (lower == null || value.compareTo(lower) < 0) lower = value.clone()
    -      sizeInBytes += STRING.actualSize(row, ordinal)
    +      val size = STRING.actualSize(row, ordinal)
    --- End diff --
    
    ah i see, nvm


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark issue #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by SparkQA <gi...@git.apache.org>.
Github user SparkQA commented on the issue:

    https://github.com/apache/spark/pull/18002
  
    **[Test build #77050 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/77050/testReport)** for PR 18002 at commit [`66fefb6`](https://github.com/apache/spark/commit/66fefb6a8a0ddf1d1dc9318e5320506987da9c59).


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by asfgit <gi...@git.apache.org>.
Github user asfgit closed the pull request at:

    https://github.com/apache/spark/pull/18002


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by kiszk <gi...@git.apache.org>.
Github user kiszk commented on a diff in the pull request:

    https://github.com/apache/spark/pull/18002#discussion_r117093783
  
    --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala ---
    @@ -53,219 +53,297 @@ private[columnar] sealed trait ColumnStats extends Serializable {
       /**
        * Gathers statistics information from `row(ordinal)`.
        */
    -  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    if (row.isNullAt(ordinal)) {
    -      nullCount += 1
    -      // 4 bytes for null position
    -      sizeInBytes += 4
    -    }
    +  def gatherStats(row: InternalRow, ordinal: Int): Unit
    +
    +  /**
    +   * Gathers statistics information on `null`.
    +   */
    +  def gatherNullStats(): Unit = {
    +    nullCount += 1
    +    // 4 bytes for null position
    +    sizeInBytes += 4
         count += 1
       }
     
       /**
    -   * Column statistics represented as a single row, currently including closed lower bound, closed
    +   * Column statistics represented as an array, currently including closed lower bound, closed
        * upper bound and null count.
        */
    -  def collectedStatistics: GenericInternalRow
    +  def collectedStatistics: Array[Any]
     }
     
     /**
      * A no-op ColumnStats only used for testing purposes.
      */
    -private[columnar] class NoopColumnStats extends ColumnStats {
    -  override def gatherStats(row: InternalRow, ordinal: Int): Unit = super.gatherStats(row, ordinal)
    +private[columnar] final class NoopColumnStats extends ColumnStats {
    +  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    +    if (!row.isNullAt(ordinal)) {
    +      count += 1
    +    } else {
    +      super.gatherNullStats
    --- End diff --
    
    Good catch. done.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by kiszk <gi...@git.apache.org>.
Github user kiszk commented on a diff in the pull request:

    https://github.com/apache/spark/pull/18002#discussion_r117093744
  
    --- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnStatsSuite.scala ---
    @@ -33,18 +32,18 @@ class ColumnStatsSuite extends SparkFunSuite {
       testColumnStats(classOf[StringColumnStats], STRING, createRow(null, null, 0))
       testDecimalColumnStats(createRow(null, null, 0))
     
    -  def createRow(values: Any*): GenericInternalRow = new GenericInternalRow(values.toArray)
    +  def createRow(values: Any*): Array[Any] = values.toArray
    --- End diff --
    
    I see. Eliminated.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by kiszk <gi...@git.apache.org>.
Github user kiszk commented on a diff in the pull request:

    https://github.com/apache/spark/pull/18002#discussion_r117093688
  
    --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala ---
    @@ -53,219 +53,297 @@ private[columnar] sealed trait ColumnStats extends Serializable {
       /**
        * Gathers statistics information from `row(ordinal)`.
        */
    -  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    if (row.isNullAt(ordinal)) {
    -      nullCount += 1
    -      // 4 bytes for null position
    -      sizeInBytes += 4
    -    }
    +  def gatherStats(row: InternalRow, ordinal: Int): Unit
    +
    +  /**
    +   * Gathers statistics information on `null`.
    +   */
    +  def gatherNullStats(): Unit = {
    +    nullCount += 1
    +    // 4 bytes for null position
    +    sizeInBytes += 4
         count += 1
       }
     
       /**
    -   * Column statistics represented as a single row, currently including closed lower bound, closed
    +   * Column statistics represented as an array, currently including closed lower bound, closed
        * upper bound and null count.
        */
    -  def collectedStatistics: GenericInternalRow
    +  def collectedStatistics: Array[Any]
     }
     
     /**
      * A no-op ColumnStats only used for testing purposes.
      */
    -private[columnar] class NoopColumnStats extends ColumnStats {
    -  override def gatherStats(row: InternalRow, ordinal: Int): Unit = super.gatherStats(row, ordinal)
    +private[columnar] final class NoopColumnStats extends ColumnStats {
    +  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    +    if (!row.isNullAt(ordinal)) {
    +      count += 1
    +    } else {
    +      super.gatherNullStats
    +    }
    +  }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](null, null, nullCount, count, 0L))
    +  override def collectedStatistics: Array[Any] = Array[Any](null, null, nullCount, count, 0L)
     }
     
    -private[columnar] class BooleanColumnStats extends ColumnStats {
    +private[columnar] final class BooleanColumnStats extends ColumnStats {
       protected var upper = false
       protected var lower = true
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getBoolean(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += BOOLEAN.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      super.gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Boolean): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += BOOLEAN.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class ByteColumnStats extends ColumnStats {
    +private[columnar] final class ByteColumnStats extends ColumnStats {
       protected var upper = Byte.MinValue
       protected var lower = Byte.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getByte(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += BYTE.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      super.gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Byte): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += BYTE.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class ShortColumnStats extends ColumnStats {
    +private[columnar] final class ShortColumnStats extends ColumnStats {
       protected var upper = Short.MinValue
       protected var lower = Short.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getShort(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += SHORT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      super.gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Short): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += SHORT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class IntColumnStats extends ColumnStats {
    +private[columnar] final class IntColumnStats extends ColumnStats {
       protected var upper = Int.MinValue
       protected var lower = Int.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getInt(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += INT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      super.gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Int): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += INT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class LongColumnStats extends ColumnStats {
    +private[columnar] final class LongColumnStats extends ColumnStats {
       protected var upper = Long.MinValue
       protected var lower = Long.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getLong(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += LONG.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      super.gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Long): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += LONG.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class FloatColumnStats extends ColumnStats {
    +private[columnar] final class FloatColumnStats extends ColumnStats {
       protected var upper = Float.MinValue
       protected var lower = Float.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getFloat(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += FLOAT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      super.gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Float): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += FLOAT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class DoubleColumnStats extends ColumnStats {
    +private[columnar] final class DoubleColumnStats extends ColumnStats {
       protected var upper = Double.MinValue
       protected var lower = Double.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getDouble(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += DOUBLE.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      super.gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Double): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += DOUBLE.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class StringColumnStats extends ColumnStats {
    +private[columnar] final class StringColumnStats extends ColumnStats {
       protected var upper: UTF8String = null
       protected var lower: UTF8String = null
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getUTF8String(ordinal)
    -      if (upper == null || value.compareTo(upper) > 0) upper = value.clone()
    -      if (lower == null || value.compareTo(lower) < 0) lower = value.clone()
    -      sizeInBytes += STRING.actualSize(row, ordinal)
    +      val size = STRING.actualSize(row, ordinal)
    +      gatherValueStats(value, size)
    +    } else {
    +      super.gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: UTF8String, size: Int): Unit = {
    +    if (upper == null || value.compareTo(upper) > 0) upper = value.clone()
    +    if (lower == null || value.compareTo(lower) < 0) lower = value.clone()
    +    sizeInBytes += size
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class BinaryColumnStats extends ColumnStats {
    +private[columnar] final class BinaryColumnStats extends ColumnStats {
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
    -      sizeInBytes += BINARY.actualSize(row, ordinal)
    +      val size = BINARY.actualSize(row, ordinal)
    +      gatherValueStats(size)
    --- End diff --
    
    Thanks, done.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark issue #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by AmplabJenkins <gi...@git.apache.org>.
Github user AmplabJenkins commented on the issue:

    https://github.com/apache/spark/pull/18002
  
    Merged build finished. Test PASSed.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark issue #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by kiszk <gi...@git.apache.org>.
Github user kiszk commented on the issue:

    https://github.com/apache/spark/pull/18002
  
    @cloud-fan and @viirya, thank you for good comments. I am looking forward to merging it into master.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by kiszk <gi...@git.apache.org>.
Github user kiszk commented on a diff in the pull request:

    https://github.com/apache/spark/pull/18002#discussion_r117177275
  
    --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala ---
    @@ -53,219 +53,299 @@ private[columnar] sealed trait ColumnStats extends Serializable {
       /**
        * Gathers statistics information from `row(ordinal)`.
        */
    -  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    if (row.isNullAt(ordinal)) {
    -      nullCount += 1
    -      // 4 bytes for null position
    -      sizeInBytes += 4
    -    }
    +  def gatherStats(row: InternalRow, ordinal: Int): Unit
    +
    +  /**
    +   * Gathers statistics information on `null`.
    +   */
    +  def gatherNullStats(): Unit = {
    +    nullCount += 1
    +    // 4 bytes for null position
    +    sizeInBytes += 4
         count += 1
       }
     
       /**
    -   * Column statistics represented as a single row, currently including closed lower bound, closed
    +   * Column statistics represented as an array, currently including closed lower bound, closed
        * upper bound and null count.
        */
    -  def collectedStatistics: GenericInternalRow
    +  def collectedStatistics: Array[Any]
     }
     
     /**
      * A no-op ColumnStats only used for testing purposes.
      */
    -private[columnar] class NoopColumnStats extends ColumnStats {
    -  override def gatherStats(row: InternalRow, ordinal: Int): Unit = super.gatherStats(row, ordinal)
    +private[columnar] final class NoopColumnStats extends ColumnStats {
    +  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    +    if (!row.isNullAt(ordinal)) {
    +      count += 1
    +    } else {
    +      gatherNullStats
    +    }
    +  }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](null, null, nullCount, count, 0L))
    +  override def collectedStatistics: Array[Any] = Array[Any](null, null, nullCount, count, 0L)
     }
     
    -private[columnar] class BooleanColumnStats extends ColumnStats {
    +private[columnar] final class BooleanColumnStats extends ColumnStats {
       protected var upper = false
       protected var lower = true
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getBoolean(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += BOOLEAN.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Boolean): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += BOOLEAN.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class ByteColumnStats extends ColumnStats {
    +private[columnar] final class ByteColumnStats extends ColumnStats {
       protected var upper = Byte.MinValue
       protected var lower = Byte.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getByte(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += BYTE.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Byte): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += BYTE.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class ShortColumnStats extends ColumnStats {
    +private[columnar] final class ShortColumnStats extends ColumnStats {
       protected var upper = Short.MinValue
       protected var lower = Short.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getShort(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += SHORT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Short): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += SHORT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class IntColumnStats extends ColumnStats {
    +private[columnar] final class IntColumnStats extends ColumnStats {
       protected var upper = Int.MinValue
       protected var lower = Int.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getInt(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += INT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Int): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += INT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class LongColumnStats extends ColumnStats {
    +private[columnar] final class LongColumnStats extends ColumnStats {
       protected var upper = Long.MinValue
       protected var lower = Long.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getLong(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += LONG.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Long): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += LONG.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class FloatColumnStats extends ColumnStats {
    +private[columnar] final class FloatColumnStats extends ColumnStats {
       protected var upper = Float.MinValue
       protected var lower = Float.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getFloat(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += FLOAT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Float): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += FLOAT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class DoubleColumnStats extends ColumnStats {
    +private[columnar] final class DoubleColumnStats extends ColumnStats {
       protected var upper = Double.MinValue
       protected var lower = Double.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getDouble(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += DOUBLE.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Double): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += DOUBLE.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class StringColumnStats extends ColumnStats {
    +private[columnar] final class StringColumnStats extends ColumnStats {
       protected var upper: UTF8String = null
       protected var lower: UTF8String = null
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getUTF8String(ordinal)
    -      if (upper == null || value.compareTo(upper) > 0) upper = value.clone()
    -      if (lower == null || value.compareTo(lower) < 0) lower = value.clone()
    -      sizeInBytes += STRING.actualSize(row, ordinal)
    +      val size = STRING.actualSize(row, ordinal)
    --- End diff --
    
    Do you want to add the new method `STRING.actualSize(s: UTF8String)`? The current signature `actualSize(row: InternalRow, ordinal: Int)` cannot be changed since it is declared at the super class.



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark issue #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by SparkQA <gi...@git.apache.org>.
Github user SparkQA commented on the issue:

    https://github.com/apache/spark/pull/18002
  
    **[Test build #76975 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/76975/testReport)** for PR 18002 at commit [`11057f4`](https://github.com/apache/spark/commit/11057f4c2ac8f24212e6af7d22223781f752987e).


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by cloud-fan <gi...@git.apache.org>.
Github user cloud-fan commented on a diff in the pull request:

    https://github.com/apache/spark/pull/18002#discussion_r116933991
  
    --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala ---
    @@ -53,219 +53,297 @@ private[columnar] sealed trait ColumnStats extends Serializable {
       /**
        * Gathers statistics information from `row(ordinal)`.
        */
    -  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    if (row.isNullAt(ordinal)) {
    -      nullCount += 1
    -      // 4 bytes for null position
    -      sizeInBytes += 4
    -    }
    +  def gatherStats(row: InternalRow, ordinal: Int): Unit
    +
    +  /**
    +   * Gathers statistics information on `null`.
    +   */
    +  def gatherNullStats(): Unit = {
    +    nullCount += 1
    +    // 4 bytes for null position
    +    sizeInBytes += 4
         count += 1
       }
     
       /**
    -   * Column statistics represented as a single row, currently including closed lower bound, closed
    +   * Column statistics represented as an array, currently including closed lower bound, closed
        * upper bound and null count.
        */
    -  def collectedStatistics: GenericInternalRow
    +  def collectedStatistics: Array[Any]
     }
     
     /**
      * A no-op ColumnStats only used for testing purposes.
      */
    -private[columnar] class NoopColumnStats extends ColumnStats {
    -  override def gatherStats(row: InternalRow, ordinal: Int): Unit = super.gatherStats(row, ordinal)
    +private[columnar] final class NoopColumnStats extends ColumnStats {
    +  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    +    if (!row.isNullAt(ordinal)) {
    +      count += 1
    +    } else {
    +      super.gatherNullStats
    --- End diff --
    
    do we need the `super` keyword here?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by kiszk <gi...@git.apache.org>.
Github user kiszk commented on a diff in the pull request:

    https://github.com/apache/spark/pull/18002#discussion_r117168074
  
    --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala ---
    @@ -53,219 +53,299 @@ private[columnar] sealed trait ColumnStats extends Serializable {
       /**
        * Gathers statistics information from `row(ordinal)`.
        */
    -  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    if (row.isNullAt(ordinal)) {
    -      nullCount += 1
    -      // 4 bytes for null position
    -      sizeInBytes += 4
    -    }
    +  def gatherStats(row: InternalRow, ordinal: Int): Unit
    +
    +  /**
    +   * Gathers statistics information on `null`.
    +   */
    +  def gatherNullStats(): Unit = {
    +    nullCount += 1
    +    // 4 bytes for null position
    +    sizeInBytes += 4
         count += 1
       }
     
       /**
    -   * Column statistics represented as a single row, currently including closed lower bound, closed
    +   * Column statistics represented as an array, currently including closed lower bound, closed
        * upper bound and null count.
        */
    -  def collectedStatistics: GenericInternalRow
    +  def collectedStatistics: Array[Any]
     }
     
     /**
      * A no-op ColumnStats only used for testing purposes.
      */
    -private[columnar] class NoopColumnStats extends ColumnStats {
    -  override def gatherStats(row: InternalRow, ordinal: Int): Unit = super.gatherStats(row, ordinal)
    +private[columnar] final class NoopColumnStats extends ColumnStats {
    +  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    +    if (!row.isNullAt(ordinal)) {
    +      count += 1
    +    } else {
    +      gatherNullStats
    +    }
    +  }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](null, null, nullCount, count, 0L))
    +  override def collectedStatistics: Array[Any] = Array[Any](null, null, nullCount, count, 0L)
     }
     
    -private[columnar] class BooleanColumnStats extends ColumnStats {
    +private[columnar] final class BooleanColumnStats extends ColumnStats {
       protected var upper = false
       protected var lower = true
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getBoolean(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += BOOLEAN.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Boolean): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += BOOLEAN.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class ByteColumnStats extends ColumnStats {
    +private[columnar] final class ByteColumnStats extends ColumnStats {
       protected var upper = Byte.MinValue
       protected var lower = Byte.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getByte(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += BYTE.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Byte): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += BYTE.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class ShortColumnStats extends ColumnStats {
    +private[columnar] final class ShortColumnStats extends ColumnStats {
       protected var upper = Short.MinValue
       protected var lower = Short.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getShort(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += SHORT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Short): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += SHORT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class IntColumnStats extends ColumnStats {
    +private[columnar] final class IntColumnStats extends ColumnStats {
       protected var upper = Int.MinValue
       protected var lower = Int.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getInt(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += INT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Int): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += INT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class LongColumnStats extends ColumnStats {
    +private[columnar] final class LongColumnStats extends ColumnStats {
       protected var upper = Long.MinValue
       protected var lower = Long.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getLong(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += LONG.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Long): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += LONG.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class FloatColumnStats extends ColumnStats {
    +private[columnar] final class FloatColumnStats extends ColumnStats {
       protected var upper = Float.MinValue
       protected var lower = Float.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getFloat(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += FLOAT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Float): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += FLOAT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class DoubleColumnStats extends ColumnStats {
    +private[columnar] final class DoubleColumnStats extends ColumnStats {
       protected var upper = Double.MinValue
       protected var lower = Double.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getDouble(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += DOUBLE.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Double): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += DOUBLE.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class StringColumnStats extends ColumnStats {
    +private[columnar] final class StringColumnStats extends ColumnStats {
       protected var upper: UTF8String = null
       protected var lower: UTF8String = null
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getUTF8String(ordinal)
    -      if (upper == null || value.compareTo(upper) > 0) upper = value.clone()
    -      if (lower == null || value.compareTo(lower) < 0) lower = value.clone()
    -      sizeInBytes += STRING.actualSize(row, ordinal)
    +      val size = STRING.actualSize(row, ordinal)
    +      gatherValueStats(value, size)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: UTF8String, size: Int): Unit = {
    +    if (upper == null || value.compareTo(upper) > 0) upper = value.clone()
    +    if (lower == null || value.compareTo(lower) < 0) lower = value.clone()
    +    sizeInBytes += size
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class BinaryColumnStats extends ColumnStats {
    +private[columnar] final class BinaryColumnStats extends ColumnStats {
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
    -      sizeInBytes += BINARY.actualSize(row, ordinal)
    +      val size = BINARY.actualSize(row, ordinal)
    +      sizeInBytes += size
    +      count += 1
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](null, null, nullCount, count, sizeInBytes))
    +  def gatherValueStats(size: Int): Unit = {
    +    sizeInBytes += size
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](null, null, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class DecimalColumnStats(precision: Int, scale: Int) extends ColumnStats {
    +private[columnar] final class DecimalColumnStats(precision: Int, scale: Int) extends ColumnStats {
       def this(dt: DecimalType) = this(dt.precision, dt.scale)
     
       protected var upper: Decimal = null
       protected var lower: Decimal = null
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getDecimal(ordinal, precision, scale)
    -      if (upper == null || value.compareTo(upper) > 0) upper = value
    -      if (lower == null || value.compareTo(lower) < 0) lower = value
           // TODO: this is not right for DecimalType with precision > 18
    -      sizeInBytes += 8
    +      val size = 8
    --- End diff --
    
    Thanks, done


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark issue #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by dongjoon-hyun <gi...@git.apache.org>.
Github user dongjoon-hyun commented on the issue:

    https://github.com/apache/spark/pull/18002
  
    +1, LGTM, too.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark issue #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by kiszk <gi...@git.apache.org>.
Github user kiszk commented on the issue:

    https://github.com/apache/spark/pull/18002
  
    @cloud-fan could you please let me know if I have to do additional things for this PR?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark issue #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by AmplabJenkins <gi...@git.apache.org>.
Github user AmplabJenkins commented on the issue:

    https://github.com/apache/spark/pull/18002
  
    Test PASSed.
    Refer to this link for build results (access rights to CI server needed): 
    https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/77050/
    Test PASSed.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark issue #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by viirya <gi...@git.apache.org>.
Github user viirya commented on the issue:

    https://github.com/apache/spark/pull/18002
  
    LGTM


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by viirya <gi...@git.apache.org>.
Github user viirya commented on a diff in the pull request:

    https://github.com/apache/spark/pull/18002#discussion_r116938870
  
    --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala ---
    @@ -53,219 +53,297 @@ private[columnar] sealed trait ColumnStats extends Serializable {
       /**
        * Gathers statistics information from `row(ordinal)`.
        */
    -  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    if (row.isNullAt(ordinal)) {
    -      nullCount += 1
    -      // 4 bytes for null position
    -      sizeInBytes += 4
    -    }
    +  def gatherStats(row: InternalRow, ordinal: Int): Unit
    +
    +  /**
    +   * Gathers statistics information on `null`.
    +   */
    +  def gatherNullStats(): Unit = {
    +    nullCount += 1
    +    // 4 bytes for null position
    +    sizeInBytes += 4
         count += 1
       }
     
       /**
    -   * Column statistics represented as a single row, currently including closed lower bound, closed
    +   * Column statistics represented as an array, currently including closed lower bound, closed
        * upper bound and null count.
        */
    -  def collectedStatistics: GenericInternalRow
    +  def collectedStatistics: Array[Any]
     }
     
     /**
      * A no-op ColumnStats only used for testing purposes.
      */
    -private[columnar] class NoopColumnStats extends ColumnStats {
    -  override def gatherStats(row: InternalRow, ordinal: Int): Unit = super.gatherStats(row, ordinal)
    +private[columnar] final class NoopColumnStats extends ColumnStats {
    +  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    +    if (!row.isNullAt(ordinal)) {
    +      count += 1
    +    } else {
    +      super.gatherNullStats
    +    }
    +  }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](null, null, nullCount, count, 0L))
    +  override def collectedStatistics: Array[Any] = Array[Any](null, null, nullCount, count, 0L)
     }
     
    -private[columnar] class BooleanColumnStats extends ColumnStats {
    +private[columnar] final class BooleanColumnStats extends ColumnStats {
       protected var upper = false
       protected var lower = true
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getBoolean(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += BOOLEAN.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      super.gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Boolean): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += BOOLEAN.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class ByteColumnStats extends ColumnStats {
    +private[columnar] final class ByteColumnStats extends ColumnStats {
       protected var upper = Byte.MinValue
       protected var lower = Byte.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getByte(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += BYTE.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      super.gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Byte): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += BYTE.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class ShortColumnStats extends ColumnStats {
    +private[columnar] final class ShortColumnStats extends ColumnStats {
       protected var upper = Short.MinValue
       protected var lower = Short.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getShort(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += SHORT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      super.gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Short): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += SHORT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class IntColumnStats extends ColumnStats {
    +private[columnar] final class IntColumnStats extends ColumnStats {
       protected var upper = Int.MinValue
       protected var lower = Int.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getInt(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += INT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      super.gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Int): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += INT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class LongColumnStats extends ColumnStats {
    +private[columnar] final class LongColumnStats extends ColumnStats {
       protected var upper = Long.MinValue
       protected var lower = Long.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getLong(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += LONG.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      super.gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Long): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += LONG.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class FloatColumnStats extends ColumnStats {
    +private[columnar] final class FloatColumnStats extends ColumnStats {
       protected var upper = Float.MinValue
       protected var lower = Float.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getFloat(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += FLOAT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      super.gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Float): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += FLOAT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class DoubleColumnStats extends ColumnStats {
    +private[columnar] final class DoubleColumnStats extends ColumnStats {
       protected var upper = Double.MinValue
       protected var lower = Double.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getDouble(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += DOUBLE.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      super.gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Double): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += DOUBLE.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class StringColumnStats extends ColumnStats {
    +private[columnar] final class StringColumnStats extends ColumnStats {
       protected var upper: UTF8String = null
       protected var lower: UTF8String = null
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getUTF8String(ordinal)
    -      if (upper == null || value.compareTo(upper) > 0) upper = value.clone()
    -      if (lower == null || value.compareTo(lower) < 0) lower = value.clone()
    -      sizeInBytes += STRING.actualSize(row, ordinal)
    +      val size = STRING.actualSize(row, ordinal)
    +      gatherValueStats(value, size)
    +    } else {
    +      super.gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: UTF8String, size: Int): Unit = {
    +    if (upper == null || value.compareTo(upper) > 0) upper = value.clone()
    +    if (lower == null || value.compareTo(lower) < 0) lower = value.clone()
    +    sizeInBytes += size
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class BinaryColumnStats extends ColumnStats {
    +private[columnar] final class BinaryColumnStats extends ColumnStats {
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
    -      sizeInBytes += BINARY.actualSize(row, ordinal)
    +      val size = BINARY.actualSize(row, ordinal)
    +      gatherValueStats(size)
    --- End diff --
    
    Nit: we may not need `gatherValueStats`  here. Simply inline:
    
        sizeInBytes += BINARY.actualSize(row, ordinal)
        count += 1


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark issue #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by kiszk <gi...@git.apache.org>.
Github user kiszk commented on the issue:

    https://github.com/apache/spark/pull/18002
  
    Thank you very much


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by cloud-fan <gi...@git.apache.org>.
Github user cloud-fan commented on a diff in the pull request:

    https://github.com/apache/spark/pull/18002#discussion_r116934486
  
    --- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnStatsSuite.scala ---
    @@ -33,18 +32,18 @@ class ColumnStatsSuite extends SparkFunSuite {
       testColumnStats(classOf[StringColumnStats], STRING, createRow(null, null, 0))
       testDecimalColumnStats(createRow(null, null, 0))
     
    -  def createRow(values: Any*): GenericInternalRow = new GenericInternalRow(values.toArray)
    +  def createRow(values: Any*): Array[Any] = values.toArray
    --- End diff --
    
    do we still need this method?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by viirya <gi...@git.apache.org>.
Github user viirya commented on a diff in the pull request:

    https://github.com/apache/spark/pull/18002#discussion_r116940295
  
    --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala ---
    @@ -53,219 +53,297 @@ private[columnar] sealed trait ColumnStats extends Serializable {
       /**
        * Gathers statistics information from `row(ordinal)`.
        */
    -  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    if (row.isNullAt(ordinal)) {
    -      nullCount += 1
    -      // 4 bytes for null position
    -      sizeInBytes += 4
    -    }
    +  def gatherStats(row: InternalRow, ordinal: Int): Unit
    +
    +  /**
    +   * Gathers statistics information on `null`.
    +   */
    +  def gatherNullStats(): Unit = {
    +    nullCount += 1
    +    // 4 bytes for null position
    +    sizeInBytes += 4
         count += 1
       }
     
       /**
    -   * Column statistics represented as a single row, currently including closed lower bound, closed
    +   * Column statistics represented as an array, currently including closed lower bound, closed
        * upper bound and null count.
        */
    -  def collectedStatistics: GenericInternalRow
    +  def collectedStatistics: Array[Any]
     }
     
     /**
      * A no-op ColumnStats only used for testing purposes.
      */
    -private[columnar] class NoopColumnStats extends ColumnStats {
    -  override def gatherStats(row: InternalRow, ordinal: Int): Unit = super.gatherStats(row, ordinal)
    +private[columnar] final class NoopColumnStats extends ColumnStats {
    +  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    +    if (!row.isNullAt(ordinal)) {
    +      count += 1
    +    } else {
    +      super.gatherNullStats
    +    }
    +  }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](null, null, nullCount, count, 0L))
    +  override def collectedStatistics: Array[Any] = Array[Any](null, null, nullCount, count, 0L)
     }
     
    -private[columnar] class BooleanColumnStats extends ColumnStats {
    +private[columnar] final class BooleanColumnStats extends ColumnStats {
       protected var upper = false
       protected var lower = true
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getBoolean(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += BOOLEAN.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      super.gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Boolean): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += BOOLEAN.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class ByteColumnStats extends ColumnStats {
    +private[columnar] final class ByteColumnStats extends ColumnStats {
       protected var upper = Byte.MinValue
       protected var lower = Byte.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getByte(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += BYTE.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      super.gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Byte): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += BYTE.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class ShortColumnStats extends ColumnStats {
    +private[columnar] final class ShortColumnStats extends ColumnStats {
       protected var upper = Short.MinValue
       protected var lower = Short.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getShort(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += SHORT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      super.gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Short): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += SHORT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class IntColumnStats extends ColumnStats {
    +private[columnar] final class IntColumnStats extends ColumnStats {
       protected var upper = Int.MinValue
       protected var lower = Int.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getInt(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += INT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      super.gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Int): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += INT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class LongColumnStats extends ColumnStats {
    +private[columnar] final class LongColumnStats extends ColumnStats {
       protected var upper = Long.MinValue
       protected var lower = Long.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getLong(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += LONG.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      super.gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Long): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += LONG.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class FloatColumnStats extends ColumnStats {
    +private[columnar] final class FloatColumnStats extends ColumnStats {
       protected var upper = Float.MinValue
       protected var lower = Float.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getFloat(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += FLOAT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      super.gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Float): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += FLOAT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class DoubleColumnStats extends ColumnStats {
    +private[columnar] final class DoubleColumnStats extends ColumnStats {
       protected var upper = Double.MinValue
       protected var lower = Double.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getDouble(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += DOUBLE.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      super.gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Double): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += DOUBLE.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class StringColumnStats extends ColumnStats {
    +private[columnar] final class StringColumnStats extends ColumnStats {
       protected var upper: UTF8String = null
       protected var lower: UTF8String = null
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getUTF8String(ordinal)
    -      if (upper == null || value.compareTo(upper) > 0) upper = value.clone()
    -      if (lower == null || value.compareTo(lower) < 0) lower = value.clone()
    -      sizeInBytes += STRING.actualSize(row, ordinal)
    +      val size = STRING.actualSize(row, ordinal)
    +      gatherValueStats(value, size)
    +    } else {
    +      super.gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: UTF8String, size: Int): Unit = {
    +    if (upper == null || value.compareTo(upper) > 0) upper = value.clone()
    +    if (lower == null || value.compareTo(lower) < 0) lower = value.clone()
    +    sizeInBytes += size
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class BinaryColumnStats extends ColumnStats {
    +private[columnar] final class BinaryColumnStats extends ColumnStats {
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
    -      sizeInBytes += BINARY.actualSize(row, ordinal)
    +      val size = BINARY.actualSize(row, ordinal)
    +      gatherValueStats(size)
    +    } else {
    +      super.gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](null, null, nullCount, count, sizeInBytes))
    +  def gatherValueStats(size: Int): Unit = {
    +    sizeInBytes += size
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](null, null, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class DecimalColumnStats(precision: Int, scale: Int) extends ColumnStats {
    +private[columnar] final class DecimalColumnStats(precision: Int, scale: Int) extends ColumnStats {
       def this(dt: DecimalType) = this(dt.precision, dt.scale)
     
       protected var upper: Decimal = null
       protected var lower: Decimal = null
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getDecimal(ordinal, precision, scale)
    -      if (upper == null || value.compareTo(upper) > 0) upper = value
    -      if (lower == null || value.compareTo(lower) < 0) lower = value
           // TODO: this is not right for DecimalType with precision > 18
    -      sizeInBytes += 8
    +      val size = 8
    +      gatherValueStats(value, size)
    +    } else {
    +      super.gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Decimal, size: Int): Unit = {
    +    if (upper == null || value.compareTo(upper) > 0) upper = value
    +    if (lower == null || value.compareTo(lower) < 0) lower = value
    +    sizeInBytes += size
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class ObjectColumnStats(dataType: DataType) extends ColumnStats {
    +private[columnar] final class ObjectColumnStats(dataType: DataType) extends ColumnStats {
       val columnType = ColumnType(dataType)
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
    -      sizeInBytes += columnType.actualSize(row, ordinal)
    +      val size = columnType.actualSize(row, ordinal)
    +      gatherValueStats(size)
    +    } else {
    +      super.gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](null, null, nullCount, count, sizeInBytes))
    +  def gatherValueStats(size: Int): Unit = {
    --- End diff --
    
    Nit: we can inline this too.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by cloud-fan <gi...@git.apache.org>.
Github user cloud-fan commented on a diff in the pull request:

    https://github.com/apache/spark/pull/18002#discussion_r117153570
  
    --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala ---
    @@ -53,219 +53,299 @@ private[columnar] sealed trait ColumnStats extends Serializable {
       /**
        * Gathers statistics information from `row(ordinal)`.
        */
    -  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    if (row.isNullAt(ordinal)) {
    -      nullCount += 1
    -      // 4 bytes for null position
    -      sizeInBytes += 4
    -    }
    +  def gatherStats(row: InternalRow, ordinal: Int): Unit
    +
    +  /**
    +   * Gathers statistics information on `null`.
    +   */
    +  def gatherNullStats(): Unit = {
    +    nullCount += 1
    +    // 4 bytes for null position
    +    sizeInBytes += 4
         count += 1
       }
     
       /**
    -   * Column statistics represented as a single row, currently including closed lower bound, closed
    +   * Column statistics represented as an array, currently including closed lower bound, closed
        * upper bound and null count.
        */
    -  def collectedStatistics: GenericInternalRow
    +  def collectedStatistics: Array[Any]
     }
     
     /**
      * A no-op ColumnStats only used for testing purposes.
      */
    -private[columnar] class NoopColumnStats extends ColumnStats {
    -  override def gatherStats(row: InternalRow, ordinal: Int): Unit = super.gatherStats(row, ordinal)
    +private[columnar] final class NoopColumnStats extends ColumnStats {
    +  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    +    if (!row.isNullAt(ordinal)) {
    +      count += 1
    +    } else {
    +      gatherNullStats
    +    }
    +  }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](null, null, nullCount, count, 0L))
    +  override def collectedStatistics: Array[Any] = Array[Any](null, null, nullCount, count, 0L)
     }
     
    -private[columnar] class BooleanColumnStats extends ColumnStats {
    +private[columnar] final class BooleanColumnStats extends ColumnStats {
       protected var upper = false
       protected var lower = true
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getBoolean(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += BOOLEAN.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Boolean): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += BOOLEAN.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class ByteColumnStats extends ColumnStats {
    +private[columnar] final class ByteColumnStats extends ColumnStats {
       protected var upper = Byte.MinValue
       protected var lower = Byte.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getByte(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += BYTE.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Byte): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += BYTE.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class ShortColumnStats extends ColumnStats {
    +private[columnar] final class ShortColumnStats extends ColumnStats {
       protected var upper = Short.MinValue
       protected var lower = Short.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getShort(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += SHORT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Short): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += SHORT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class IntColumnStats extends ColumnStats {
    +private[columnar] final class IntColumnStats extends ColumnStats {
       protected var upper = Int.MinValue
       protected var lower = Int.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getInt(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += INT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Int): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += INT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class LongColumnStats extends ColumnStats {
    +private[columnar] final class LongColumnStats extends ColumnStats {
       protected var upper = Long.MinValue
       protected var lower = Long.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getLong(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += LONG.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Long): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += LONG.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class FloatColumnStats extends ColumnStats {
    +private[columnar] final class FloatColumnStats extends ColumnStats {
       protected var upper = Float.MinValue
       protected var lower = Float.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getFloat(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += FLOAT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Float): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += FLOAT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class DoubleColumnStats extends ColumnStats {
    +private[columnar] final class DoubleColumnStats extends ColumnStats {
       protected var upper = Double.MinValue
       protected var lower = Double.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getDouble(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += DOUBLE.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Double): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += DOUBLE.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class StringColumnStats extends ColumnStats {
    +private[columnar] final class StringColumnStats extends ColumnStats {
       protected var upper: UTF8String = null
       protected var lower: UTF8String = null
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getUTF8String(ordinal)
    -      if (upper == null || value.compareTo(upper) > 0) upper = value.clone()
    -      if (lower == null || value.compareTo(lower) < 0) lower = value.clone()
    -      sizeInBytes += STRING.actualSize(row, ordinal)
    +      val size = STRING.actualSize(row, ordinal)
    +      gatherValueStats(value, size)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: UTF8String, size: Int): Unit = {
    +    if (upper == null || value.compareTo(upper) > 0) upper = value.clone()
    +    if (lower == null || value.compareTo(lower) < 0) lower = value.clone()
    +    sizeInBytes += size
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class BinaryColumnStats extends ColumnStats {
    +private[columnar] final class BinaryColumnStats extends ColumnStats {
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
    -      sizeInBytes += BINARY.actualSize(row, ordinal)
    +      val size = BINARY.actualSize(row, ordinal)
    +      sizeInBytes += size
    +      count += 1
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](null, null, nullCount, count, sizeInBytes))
    +  def gatherValueStats(size: Int): Unit = {
    +    sizeInBytes += size
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](null, null, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class DecimalColumnStats(precision: Int, scale: Int) extends ColumnStats {
    +private[columnar] final class DecimalColumnStats(precision: Int, scale: Int) extends ColumnStats {
       def this(dt: DecimalType) = this(dt.precision, dt.scale)
     
       protected var upper: Decimal = null
       protected var lower: Decimal = null
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getDecimal(ordinal, precision, scale)
    -      if (upper == null || value.compareTo(upper) > 0) upper = value
    -      if (lower == null || value.compareTo(lower) < 0) lower = value
           // TODO: this is not right for DecimalType with precision > 18
    -      sizeInBytes += 8
    +      val size = 8
    --- End diff --
    
    can we just hardcode `8` in `gatherValueStats`?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark issue #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by kiszk <gi...@git.apache.org>.
Github user kiszk commented on the issue:

    https://github.com/apache/spark/pull/18002
  
    @hvanhovell would it be possible to take a look?
    cc @cloud-fan 


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark issue #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by SparkQA <gi...@git.apache.org>.
Github user SparkQA commented on the issue:

    https://github.com/apache/spark/pull/18002
  
    **[Test build #77029 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/77029/testReport)** for PR 18002 at commit [`3e3ffde`](https://github.com/apache/spark/commit/3e3ffde9c566e54474fad0c7508544606ddddef3).


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by cloud-fan <gi...@git.apache.org>.
Github user cloud-fan commented on a diff in the pull request:

    https://github.com/apache/spark/pull/18002#discussion_r117167259
  
    --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala ---
    @@ -53,219 +53,299 @@ private[columnar] sealed trait ColumnStats extends Serializable {
       /**
        * Gathers statistics information from `row(ordinal)`.
        */
    -  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    if (row.isNullAt(ordinal)) {
    -      nullCount += 1
    -      // 4 bytes for null position
    -      sizeInBytes += 4
    -    }
    +  def gatherStats(row: InternalRow, ordinal: Int): Unit
    +
    +  /**
    +   * Gathers statistics information on `null`.
    +   */
    +  def gatherNullStats(): Unit = {
    +    nullCount += 1
    +    // 4 bytes for null position
    +    sizeInBytes += 4
         count += 1
       }
     
       /**
    -   * Column statistics represented as a single row, currently including closed lower bound, closed
    +   * Column statistics represented as an array, currently including closed lower bound, closed
        * upper bound and null count.
        */
    -  def collectedStatistics: GenericInternalRow
    +  def collectedStatistics: Array[Any]
     }
     
     /**
      * A no-op ColumnStats only used for testing purposes.
      */
    -private[columnar] class NoopColumnStats extends ColumnStats {
    -  override def gatherStats(row: InternalRow, ordinal: Int): Unit = super.gatherStats(row, ordinal)
    +private[columnar] final class NoopColumnStats extends ColumnStats {
    +  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    +    if (!row.isNullAt(ordinal)) {
    +      count += 1
    +    } else {
    +      gatherNullStats
    +    }
    +  }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](null, null, nullCount, count, 0L))
    +  override def collectedStatistics: Array[Any] = Array[Any](null, null, nullCount, count, 0L)
     }
     
    -private[columnar] class BooleanColumnStats extends ColumnStats {
    +private[columnar] final class BooleanColumnStats extends ColumnStats {
       protected var upper = false
       protected var lower = true
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getBoolean(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += BOOLEAN.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Boolean): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += BOOLEAN.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class ByteColumnStats extends ColumnStats {
    +private[columnar] final class ByteColumnStats extends ColumnStats {
       protected var upper = Byte.MinValue
       protected var lower = Byte.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getByte(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += BYTE.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Byte): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += BYTE.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class ShortColumnStats extends ColumnStats {
    +private[columnar] final class ShortColumnStats extends ColumnStats {
       protected var upper = Short.MinValue
       protected var lower = Short.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getShort(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += SHORT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Short): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += SHORT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class IntColumnStats extends ColumnStats {
    +private[columnar] final class IntColumnStats extends ColumnStats {
       protected var upper = Int.MinValue
       protected var lower = Int.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getInt(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += INT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Int): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += INT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class LongColumnStats extends ColumnStats {
    +private[columnar] final class LongColumnStats extends ColumnStats {
       protected var upper = Long.MinValue
       protected var lower = Long.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getLong(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += LONG.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Long): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += LONG.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class FloatColumnStats extends ColumnStats {
    +private[columnar] final class FloatColumnStats extends ColumnStats {
       protected var upper = Float.MinValue
       protected var lower = Float.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getFloat(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += FLOAT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Float): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += FLOAT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class DoubleColumnStats extends ColumnStats {
    +private[columnar] final class DoubleColumnStats extends ColumnStats {
       protected var upper = Double.MinValue
       protected var lower = Double.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getDouble(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += DOUBLE.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Double): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += DOUBLE.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class StringColumnStats extends ColumnStats {
    +private[columnar] final class StringColumnStats extends ColumnStats {
       protected var upper: UTF8String = null
       protected var lower: UTF8String = null
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getUTF8String(ordinal)
    -      if (upper == null || value.compareTo(upper) > 0) upper = value.clone()
    -      if (lower == null || value.compareTo(lower) < 0) lower = value.clone()
    -      sizeInBytes += STRING.actualSize(row, ordinal)
    +      val size = STRING.actualSize(row, ordinal)
    --- End diff --
    
    I mean we can just pass the UTF8String to STRING.actualSize


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark issue #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by SparkQA <gi...@git.apache.org>.
Github user SparkQA commented on the issue:

    https://github.com/apache/spark/pull/18002
  
    **[Test build #76975 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/76975/testReport)** for PR 18002 at commit [`11057f4`](https://github.com/apache/spark/commit/11057f4c2ac8f24212e6af7d22223781f752987e).
     * This patch passes all tests.
     * This patch merges cleanly.
     * This patch adds no public classes.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark issue #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by SparkQA <gi...@git.apache.org>.
Github user SparkQA commented on the issue:

    https://github.com/apache/spark/pull/18002
  
    **[Test build #77050 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/77050/testReport)** for PR 18002 at commit [`66fefb6`](https://github.com/apache/spark/commit/66fefb6a8a0ddf1d1dc9318e5320506987da9c59).
     * This patch passes all tests.
     * This patch merges cleanly.
     * This patch adds no public classes.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark issue #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by AmplabJenkins <gi...@git.apache.org>.
Github user AmplabJenkins commented on the issue:

    https://github.com/apache/spark/pull/18002
  
    Test PASSed.
    Refer to this link for build results (access rights to CI server needed): 
    https://amplab.cs.berkeley.edu/jenkins//job/SparkPullRequestBuilder/76975/
    Test PASSed.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by kiszk <gi...@git.apache.org>.
Github user kiszk commented on a diff in the pull request:

    https://github.com/apache/spark/pull/18002#discussion_r117167072
  
    --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala ---
    @@ -53,219 +53,299 @@ private[columnar] sealed trait ColumnStats extends Serializable {
       /**
        * Gathers statistics information from `row(ordinal)`.
        */
    -  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    if (row.isNullAt(ordinal)) {
    -      nullCount += 1
    -      // 4 bytes for null position
    -      sizeInBytes += 4
    -    }
    +  def gatherStats(row: InternalRow, ordinal: Int): Unit
    +
    +  /**
    +   * Gathers statistics information on `null`.
    +   */
    +  def gatherNullStats(): Unit = {
    +    nullCount += 1
    +    // 4 bytes for null position
    +    sizeInBytes += 4
         count += 1
       }
     
       /**
    -   * Column statistics represented as a single row, currently including closed lower bound, closed
    +   * Column statistics represented as an array, currently including closed lower bound, closed
        * upper bound and null count.
        */
    -  def collectedStatistics: GenericInternalRow
    +  def collectedStatistics: Array[Any]
     }
     
     /**
      * A no-op ColumnStats only used for testing purposes.
      */
    -private[columnar] class NoopColumnStats extends ColumnStats {
    -  override def gatherStats(row: InternalRow, ordinal: Int): Unit = super.gatherStats(row, ordinal)
    +private[columnar] final class NoopColumnStats extends ColumnStats {
    +  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    +    if (!row.isNullAt(ordinal)) {
    +      count += 1
    +    } else {
    +      gatherNullStats
    +    }
    +  }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](null, null, nullCount, count, 0L))
    +  override def collectedStatistics: Array[Any] = Array[Any](null, null, nullCount, count, 0L)
     }
     
    -private[columnar] class BooleanColumnStats extends ColumnStats {
    +private[columnar] final class BooleanColumnStats extends ColumnStats {
       protected var upper = false
       protected var lower = true
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getBoolean(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += BOOLEAN.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Boolean): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += BOOLEAN.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class ByteColumnStats extends ColumnStats {
    +private[columnar] final class ByteColumnStats extends ColumnStats {
       protected var upper = Byte.MinValue
       protected var lower = Byte.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getByte(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += BYTE.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Byte): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += BYTE.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class ShortColumnStats extends ColumnStats {
    +private[columnar] final class ShortColumnStats extends ColumnStats {
       protected var upper = Short.MinValue
       protected var lower = Short.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getShort(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += SHORT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Short): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += SHORT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class IntColumnStats extends ColumnStats {
    +private[columnar] final class IntColumnStats extends ColumnStats {
       protected var upper = Int.MinValue
       protected var lower = Int.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getInt(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += INT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Int): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += INT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class LongColumnStats extends ColumnStats {
    +private[columnar] final class LongColumnStats extends ColumnStats {
       protected var upper = Long.MinValue
       protected var lower = Long.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getLong(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += LONG.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Long): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += LONG.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class FloatColumnStats extends ColumnStats {
    +private[columnar] final class FloatColumnStats extends ColumnStats {
       protected var upper = Float.MinValue
       protected var lower = Float.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getFloat(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += FLOAT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Float): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += FLOAT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class DoubleColumnStats extends ColumnStats {
    +private[columnar] final class DoubleColumnStats extends ColumnStats {
       protected var upper = Double.MinValue
       protected var lower = Double.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getDouble(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += DOUBLE.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Double): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += DOUBLE.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class StringColumnStats extends ColumnStats {
    +private[columnar] final class StringColumnStats extends ColumnStats {
       protected var upper: UTF8String = null
       protected var lower: UTF8String = null
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getUTF8String(ordinal)
    -      if (upper == null || value.compareTo(upper) > 0) upper = value.clone()
    -      if (lower == null || value.compareTo(lower) < 0) lower = value.clone()
    -      sizeInBytes += STRING.actualSize(row, ordinal)
    +      val size = STRING.actualSize(row, ordinal)
    --- End diff --
    
    I may not understand your point.
    Do you want to use `row.getUTF8String(ordinal).numBytes() + 4` instead of calling `STRING.actualSize()`? (i.e. method inlining).


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark issue #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by cloud-fan <gi...@git.apache.org>.
Github user cloud-fan commented on the issue:

    https://github.com/apache/spark/pull/18002
  
    LGTM


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org


[GitHub] spark pull request #18002: [SPARK-20770][SQL] Improve ColumnStats

Posted by kiszk <gi...@git.apache.org>.
Github user kiszk commented on a diff in the pull request:

    https://github.com/apache/spark/pull/18002#discussion_r117168094
  
    --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala ---
    @@ -53,219 +53,299 @@ private[columnar] sealed trait ColumnStats extends Serializable {
       /**
        * Gathers statistics information from `row(ordinal)`.
        */
    -  def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    if (row.isNullAt(ordinal)) {
    -      nullCount += 1
    -      // 4 bytes for null position
    -      sizeInBytes += 4
    -    }
    +  def gatherStats(row: InternalRow, ordinal: Int): Unit
    +
    +  /**
    +   * Gathers statistics information on `null`.
    +   */
    +  def gatherNullStats(): Unit = {
    +    nullCount += 1
    +    // 4 bytes for null position
    +    sizeInBytes += 4
         count += 1
       }
     
       /**
    -   * Column statistics represented as a single row, currently including closed lower bound, closed
    +   * Column statistics represented as an array, currently including closed lower bound, closed
        * upper bound and null count.
        */
    -  def collectedStatistics: GenericInternalRow
    +  def collectedStatistics: Array[Any]
     }
     
     /**
      * A no-op ColumnStats only used for testing purposes.
      */
    -private[columnar] class NoopColumnStats extends ColumnStats {
    -  override def gatherStats(row: InternalRow, ordinal: Int): Unit = super.gatherStats(row, ordinal)
    +private[columnar] final class NoopColumnStats extends ColumnStats {
    +  override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    +    if (!row.isNullAt(ordinal)) {
    +      count += 1
    +    } else {
    +      gatherNullStats
    +    }
    +  }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](null, null, nullCount, count, 0L))
    +  override def collectedStatistics: Array[Any] = Array[Any](null, null, nullCount, count, 0L)
     }
     
    -private[columnar] class BooleanColumnStats extends ColumnStats {
    +private[columnar] final class BooleanColumnStats extends ColumnStats {
       protected var upper = false
       protected var lower = true
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getBoolean(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += BOOLEAN.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Boolean): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += BOOLEAN.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class ByteColumnStats extends ColumnStats {
    +private[columnar] final class ByteColumnStats extends ColumnStats {
       protected var upper = Byte.MinValue
       protected var lower = Byte.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getByte(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += BYTE.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Byte): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += BYTE.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class ShortColumnStats extends ColumnStats {
    +private[columnar] final class ShortColumnStats extends ColumnStats {
       protected var upper = Short.MinValue
       protected var lower = Short.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getShort(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += SHORT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Short): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += SHORT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class IntColumnStats extends ColumnStats {
    +private[columnar] final class IntColumnStats extends ColumnStats {
       protected var upper = Int.MinValue
       protected var lower = Int.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getInt(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += INT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Int): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += INT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class LongColumnStats extends ColumnStats {
    +private[columnar] final class LongColumnStats extends ColumnStats {
       protected var upper = Long.MinValue
       protected var lower = Long.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getLong(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += LONG.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Long): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += LONG.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class FloatColumnStats extends ColumnStats {
    +private[columnar] final class FloatColumnStats extends ColumnStats {
       protected var upper = Float.MinValue
       protected var lower = Float.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getFloat(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += FLOAT.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Float): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += FLOAT.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class DoubleColumnStats extends ColumnStats {
    +private[columnar] final class DoubleColumnStats extends ColumnStats {
       protected var upper = Double.MinValue
       protected var lower = Double.MaxValue
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getDouble(ordinal)
    -      if (value > upper) upper = value
    -      if (value < lower) lower = value
    -      sizeInBytes += DOUBLE.defaultSize
    +      gatherValueStats(value)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: Double): Unit = {
    +    if (value > upper) upper = value
    +    if (value < lower) lower = value
    +    sizeInBytes += DOUBLE.defaultSize
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class StringColumnStats extends ColumnStats {
    +private[columnar] final class StringColumnStats extends ColumnStats {
       protected var upper: UTF8String = null
       protected var lower: UTF8String = null
     
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
           val value = row.getUTF8String(ordinal)
    -      if (upper == null || value.compareTo(upper) > 0) upper = value.clone()
    -      if (lower == null || value.compareTo(lower) < 0) lower = value.clone()
    -      sizeInBytes += STRING.actualSize(row, ordinal)
    +      val size = STRING.actualSize(row, ordinal)
    +      gatherValueStats(value, size)
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](lower, upper, nullCount, count, sizeInBytes))
    +  def gatherValueStats(value: UTF8String, size: Int): Unit = {
    +    if (upper == null || value.compareTo(upper) > 0) upper = value.clone()
    +    if (lower == null || value.compareTo(lower) < 0) lower = value.clone()
    +    sizeInBytes += size
    +    count += 1
    +  }
    +
    +  override def collectedStatistics: Array[Any] =
    +    Array[Any](lower, upper, nullCount, count, sizeInBytes)
     }
     
    -private[columnar] class BinaryColumnStats extends ColumnStats {
    +private[columnar] final class BinaryColumnStats extends ColumnStats {
       override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
    -    super.gatherStats(row, ordinal)
         if (!row.isNullAt(ordinal)) {
    -      sizeInBytes += BINARY.actualSize(row, ordinal)
    +      val size = BINARY.actualSize(row, ordinal)
    +      sizeInBytes += size
    +      count += 1
    +    } else {
    +      gatherNullStats
         }
       }
     
    -  override def collectedStatistics: GenericInternalRow =
    -    new GenericInternalRow(Array[Any](null, null, nullCount, count, sizeInBytes))
    +  def gatherValueStats(size: Int): Unit = {
    --- End diff --
    
    Sure, removed.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org