You are viewing a plain text version of this content. The canonical link for it is here.
Posted to reviews@spark.apache.org by barrenlake <gi...@git.apache.org> on 2017/12/04 07:46:23 UTC
[GitHub] spark pull request #17176: [SPARK-19833][SQL]remove SQLConf.HIVE_VERIFY_PART...
GitHub user barrenlake commented on a diff in the pull request:
https://github.com/apache/spark/pull/17176#discussion_r154575331
--- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala ---
@@ -159,36 +159,11 @@ class HadoopTableReader(
def verifyPartitionPath(
partitionToDeserializer: Map[HivePartition, Class[_ <: Deserializer]]):
Map[HivePartition, Class[_ <: Deserializer]] = {
- if (!sparkSession.sessionState.conf.verifyPartitionPath) {
- partitionToDeserializer
- } else {
- var existPathSet = collection.mutable.Set[String]()
- var pathPatternSet = collection.mutable.Set[String]()
- partitionToDeserializer.filter {
- case (partition, partDeserializer) =>
- def updateExistPathSetByPathPattern(pathPatternStr: String) {
- val pathPattern = new Path(pathPatternStr)
- val fs = pathPattern.getFileSystem(hadoopConf)
- val matches = fs.globStatus(pathPattern)
- matches.foreach(fileStatus => existPathSet += fileStatus.getPath.toString)
- }
- // convert /demo/data/year/month/day to /demo/data/*/*/*/
- def getPathPatternByPath(parNum: Int, tempPath: Path): String = {
- var path = tempPath
- for (i <- (1 to parNum)) path = path.getParent
- val tails = (1 to parNum).map(_ => "*").mkString("/", "/", "/")
- path.toString + tails
- }
-
- val partPath = partition.getDataLocation
- val partNum = Utilities.getPartitionDesc(partition).getPartSpec.size();
- var pathPatternStr = getPathPatternByPath(partNum, partPath)
- if (!pathPatternSet.contains(pathPatternStr)) {
- pathPatternSet += pathPatternStr
- updateExistPathSetByPathPattern(pathPatternStr)
- }
- existPathSet.contains(partPath.toString)
- }
+ partitionToDeserializer.filter {
+ case (partition, partDeserializer) =>
+ val partPath = partition.getDataLocation
+ val fs = partPath.getFileSystem(hadoopConf)
+ fs.exists(partPath)
--- End diff --
Sending a separate RPC request to the NameNode for each partition can result in poor performance.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org