Posted to issues@spark.apache.org by "lostinoverflow (JIRA)" <ji...@apache.org> on 2016/11/01 22:33:58 UTC
[jira] [Updated] (SPARK-18211) Spark SQL ignores split.size
[ https://issues.apache.org/jira/browse/SPARK-18211?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
lostinoverflow updated SPARK-18211:
-----------------------------------
Description:
I expect the RDD and the DataFrame to end up with the same number of partitions (this worked in 1.6), but it looks like Spark SQL ignores the Hadoop split size configuration.
{code}
import org.apache.spark.sql.SparkSession

object App {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder()
      .master("local[*]")
      .appName("split size")
      .getOrCreate()

    // args(0) = requested split size in bytes, args(1) = input path
    spark.sparkContext.hadoopConfiguration.setInt("mapred.min.split.size", args(0).toInt)
    spark.sparkContext.hadoopConfiguration.setInt("mapred.max.split.size", args(0).toInt)

    // The RDD API honours the configured split size...
    println(spark.sparkContext.textFile(args(1)).partitions.size)
    // ...but the Dataset/DataFrame reader reports a different partition count.
    println(spark.read.textFile(args(1)).rdd.partitions.size)

    spark.stop()
  }
}
{code}
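A possible workaround sketch, assuming the Spark 2.x file-based SQL sources size their read partitions from spark.sql.files.maxPartitionBytes and spark.sql.files.openCostInBytes rather than from the Hadoop split settings (the object name and the 1024-byte value below are only placeholders, not part of the report):
{code}
import org.apache.spark.sql.SparkSession

// Sketch: steer the Dataset reader's split size through the SQL file-source
// settings instead of mapred.*.split.size. The 1024-byte value is a placeholder.
object SqlSplitSizeWorkaround {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[*]")
      .appName("sql split size workaround")
      .config("spark.sql.files.maxPartitionBytes", "1024") // target bytes per partition
      .config("spark.sql.files.openCostInBytes", "0")      // don't pad small files
      .getOrCreate()

    // With these settings the Dataset reader is expected to cap each input
    // partition at roughly 1024 bytes, regardless of the Hadoop split config.
    println(spark.read.textFile(args(0)).rdd.partitions.size)

    spark.stop()
  }
}
{code}
The RDD path would still be governed by the mapred.*.split.size values, so both settings have to agree when comparing the two partition counts.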
> Spark SQL ignores split.size
> ----------------------------
>
> Key: SPARK-18211
> URL: https://issues.apache.org/jira/browse/SPARK-18211
> Project: Spark
> Issue Type: Bug
> Components: SQL
> Affects Versions: 2.0.0
> Reporter: lostinoverflow
>
> I expect the RDD and the DataFrame to end up with the same number of partitions (this worked in 1.6), but it looks like Spark SQL ignores the Hadoop split size configuration.
> {code}
> import org.apache.spark.sql.SparkSession
> object App {
>   def main(args: Array[String]) {
>     val spark = SparkSession
>       .builder()
>       .master("local[*]")
>       .appName("split size")
>       .getOrCreate()
>     spark.sparkContext.hadoopConfiguration.setInt("mapred.min.split.size", args(0).toInt)
>     spark.sparkContext.hadoopConfiguration.setInt("mapred.max.split.size", args(0).toInt)
>     println(spark.sparkContext.textFile(args(1)).partitions.size)
>     println(spark.read.textFile(args(1)).rdd.partitions.size)
>     spark.stop()
>   }
> }
> {code}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)