You are viewing a plain text version of this content. The canonical link for it is here.
Posted to reviews@spark.apache.org by barrenlake <gi...@git.apache.org> on 2017/12/04 07:46:23 UTC

[GitHub] spark pull request #17176: [SPARK-19833][SQL]remove SQLConf.HIVE_VERIFY_PART...

Github user barrenlake commented on a diff in the pull request:

    https://github.com/apache/spark/pull/17176#discussion_r154575331
  
    --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala ---
    @@ -159,36 +159,11 @@ class HadoopTableReader(
         def verifyPartitionPath(
             partitionToDeserializer: Map[HivePartition, Class[_ <: Deserializer]]):
             Map[HivePartition, Class[_ <: Deserializer]] = {
    -      if (!sparkSession.sessionState.conf.verifyPartitionPath) {
    -        partitionToDeserializer
    -      } else {
    -        var existPathSet = collection.mutable.Set[String]()
    -        var pathPatternSet = collection.mutable.Set[String]()
    -        partitionToDeserializer.filter {
    -          case (partition, partDeserializer) =>
    -            def updateExistPathSetByPathPattern(pathPatternStr: String) {
    -              val pathPattern = new Path(pathPatternStr)
    -              val fs = pathPattern.getFileSystem(hadoopConf)
    -              val matches = fs.globStatus(pathPattern)
    -              matches.foreach(fileStatus => existPathSet += fileStatus.getPath.toString)
    -            }
    -            // convert  /demo/data/year/month/day  to  /demo/data/*/*/*/
    -            def getPathPatternByPath(parNum: Int, tempPath: Path): String = {
    -              var path = tempPath
    -              for (i <- (1 to parNum)) path = path.getParent
    -              val tails = (1 to parNum).map(_ => "*").mkString("/", "/", "/")
    -              path.toString + tails
    -            }
    -
    -            val partPath = partition.getDataLocation
    -            val partNum = Utilities.getPartitionDesc(partition).getPartSpec.size();
    -            var pathPatternStr = getPathPatternByPath(partNum, partPath)
    -            if (!pathPatternSet.contains(pathPatternStr)) {
    -              pathPatternSet += pathPatternStr
    -              updateExistPathSetByPathPattern(pathPatternStr)
    -            }
    -            existPathSet.contains(partPath.toString)
    -        }
    +      partitionToDeserializer.filter {
    +        case (partition, partDeserializer) =>
    +          val partPath = partition.getDataLocation
    +          val fs = partPath.getFileSystem(hadoopConf)
    +          fs.exists(partPath)
    --- End diff --
    
    Calling `fs.exists` once per partition sends a separate RPC request to the NameNode for every partition, which can result in poor performance.


---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org