You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Dongjoon Hyun (Jira)" <ji...@apache.org> on 2020/03/16 22:52:07 UTC
[jira] [Updated] (SPARK-27843) Remove duplicate logic of calculate table size

     [ https://issues.apache.org/jira/browse/SPARK-27843?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Dongjoon Hyun updated SPARK-27843:
----------------------------------
    Affects Version/s:     (was: 3.0.0)
                       3.1.0

> Remove duplicate logic of calculate table size
> ----------------------------------------------
>
>                 Key: SPARK-27843
>                 URL: https://issues.apache.org/jira/browse/SPARK-27843
>             Project: Spark
>          Issue Type: Improvement
>          Components: SQL
>    Affects Versions: 3.1.0
>            Reporter: Yuming Wang
>            Priority: Major
>
> {code:scala}
> /**
>  * Analyzer rule that attaches a size estimate to Hive table relations that have no
>  * catalog statistics.
>  *
>  * It rewrites every `HiveTableRelation` for which `DDLUtils.isHiveTable` holds and
>  * `tableMeta.stats` is empty, replacing the table metadata with a copy whose `stats`
>  * is a `CatalogStatistics` carrying only `sizeInBytes`.
>  *
>  * @param session session whose `sessionState.conf` and Hadoop configuration are used
>  */
> class DetermineTableStats(session: SparkSession) extends Rule[LogicalPlan] {
>   override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators {
>     case relation: HiveTableRelation
>         if DDLUtils.isHiveTable(relation.tableMeta) && relation.tableMeta.stats.isEmpty =>
>       val table = relation.tableMeta
>       // Size comes from the file system only when fallBackToHdfsForStatsEnabled is set;
>       // otherwise the configured defaultSizeInBytes is used.
>       val sizeInBytes = if (session.sessionState.conf.fallBackToHdfsForStatsEnabled) {
>         try {
>           val hadoopConf = session.sessionState.newHadoopConf()
>           val tablePath = new Path(table.location)
>           val fs: FileSystem = tablePath.getFileSystem(hadoopConf)
>           // Content-summary length of the table location as a whole; no partition
>           // listing or path filtering is applied in this snippet.
>           fs.getContentSummary(tablePath).getLength
>         } catch {
>           // Best effort: an IOException is logged and the default size is used instead.
>           case e: IOException =>
>             logWarning("Failed to get table size from hdfs.", e)
>             session.sessionState.conf.defaultSizeInBytes
>         }
>       } else {
>         session.sessionState.conf.defaultSizeInBytes
>       }
>       val withStats = table.copy(stats = Some(CatalogStatistics(sizeInBytes = BigInt(sizeInBytes))))
>       relation.copy(tableMeta = withStats)
>   }
> }
> {code}
> and
> {code:scala}
> /**
>  * Computes the total size of a catalog table.
>  *
>  * For a table without partition columns the size of the table's storage location is
>  * returned via `calculateLocationSize`. For a partitioned table the sizes of the
>  * partitions returned by `listPartitions` are summed, either through one bulk file
>  * listing or one `calculateLocationSize` call per partition, depending on
>  * `parallelFileListingInStatsComputation`.
>  *
>  * @param spark        session providing the session state, catalog and configuration
>  * @param catalogTable table whose size is computed
>  * @return the summed size as a `BigInt`
>  */
> def calculateTotalSize(spark: SparkSession, catalogTable: CatalogTable): BigInt = {
>     val sessionState = spark.sessionState
>     if (catalogTable.partitionColumnNames.isEmpty) {
>       calculateLocationSize(sessionState, catalogTable.identifier, catalogTable.storage.locationUri)
>     } else {
>       // Calculate table size as a sum of the visible partitions. See SPARK-21079
>       val partitions = sessionState.catalog.listPartitions(catalogTable.identifier)
>       if (spark.sessionState.conf.parallelFileListingInStatsComputation) {
>         // NOTE(review): `.get` assumes every listed partition has a location URI; a
>         // partition without one would make this line throw -- confirm against callers.
>         val paths = partitions.map(x => new Path(x.storage.locationUri.get))
>         // Staging-directory prefix, read from "hive.exec.stagingdir" with a
>         // ".hive-staging" default.
>         val stagingDir = sessionState.conf.getConfString("hive.exec.stagingdir", ".hive-staging")
>         // Accepts only paths that DataSourceUtils treats as data paths and whose name
>         // does not start with the staging prefix. Presumably Serializable so the filter
>         // can be shipped with the bulk listing -- TODO confirm.
>         val pathFilter = new PathFilter with Serializable {
>           override def accept(path: Path): Boolean = {
>             DataSourceUtils.isDataPath(path) && !path.getName.startsWith(stagingDir)
>           }
>         }
>         val fileStatusSeq = InMemoryFileIndex.bulkListLeafFiles(
>           paths, sessionState.newHadoopConf(), pathFilter, spark)
>         // Sum the length of every listed file across all partition paths.
>         fileStatusSeq.flatMap(_._2.map(_.getLen)).sum
>       } else {
>         // One size computation per partition location, then summed.
>         partitions.map { p =>
>           calculateLocationSize(sessionState, catalogTable.identifier, p.storage.locationUri)
>         }.sum
>       }
>     }
>   }
> {code}



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org