You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by rx...@apache.org on 2016/01/19 06:42:13 UTC

spark git commit: [SPARK-12668][SQL] Providing aliases for CSV options to be similar to Pandas and R

Repository: spark
Updated Branches:
  refs/heads/master 74ba84b64 -> 453dae567


[SPARK-12668][SQL] Providing aliases for CSV options to be similar to Pandas and R

https://issues.apache.org/jira/browse/SPARK-12668

Spark CSV datasource has been being merged (filed in [SPARK-12420](https://issues.apache.org/jira/browse/SPARK-12420)). This is a quick PR that simply adds aliases for several CSV options to be similar to Pandas and R.

- Alias for delimiter -> sep
- Alias for charset -> encoding

Author: hyukjinkwon <gu...@gmail.com>

Closes #10800 from HyukjinKwon/SPARK-12668.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/453dae56
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/453dae56
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/453dae56

Branch: refs/heads/master
Commit: 453dae56716bc254bf5022fddc9b8327c9b1a49f
Parents: 74ba84b
Author: hyukjinkwon <gu...@gmail.com>
Authored: Mon Jan 18 21:42:07 2016 -0800
Committer: Reynold Xin <rx...@databricks.com>
Committed: Mon Jan 18 21:42:07 2016 -0800

----------------------------------------------------------------------
 .../execution/datasources/csv/CSVParameters.scala    |  8 +++++---
 .../sql/execution/datasources/csv/CSVRelation.scala  |  3 ++-
 .../sql/execution/datasources/csv/CSVSuite.scala     | 15 +++++++++++++--
 3 files changed, 20 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/453dae56/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParameters.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParameters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParameters.scala
index ec16bdb..127c972 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParameters.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParameters.scala
@@ -21,7 +21,7 @@ import java.nio.charset.Charset
 
 import org.apache.spark.Logging
 
-private[sql] case class CSVParameters(parameters: Map[String, String]) extends Logging {
+private[sql] case class CSVParameters(@transient parameters: Map[String, String]) extends Logging {
 
   private def getChar(paramName: String, default: Char): Char = {
     val paramValue = parameters.get(paramName)
@@ -44,9 +44,11 @@ private[sql] case class CSVParameters(parameters: Map[String, String]) extends L
     }
   }
 
-  val delimiter = CSVTypeCast.toChar(parameters.getOrElse("delimiter", ","))
+  val delimiter = CSVTypeCast.toChar(
+    parameters.getOrElse("sep", parameters.getOrElse("delimiter", ",")))
   val parseMode = parameters.getOrElse("mode", "PERMISSIVE")
-  val charset = parameters.getOrElse("charset", Charset.forName("UTF-8").name())
+  val charset = parameters.getOrElse("encoding",
+    parameters.getOrElse("charset", Charset.forName("UTF-8").name()))
 
   val quote = getChar("quote", '\"')
   val escape = getChar("escape", '\\')

http://git-wip-us.apache.org/repos/asf/spark/blob/453dae56/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala
index 9267479..5381885 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala
@@ -58,9 +58,10 @@ private[csv] class CSVRelation(
     if (Charset.forName(params.charset) == Charset.forName("UTF-8")) {
       sqlContext.sparkContext.textFile(location)
     } else {
+      val charset = params.charset
       sqlContext.sparkContext.hadoopFile[LongWritable, Text, TextInputFormat](location)
         .mapPartitions { _.map { pair =>
-            new String(pair._2.getBytes, 0, pair._2.getLength, params.charset)
+            new String(pair._2.getBytes, 0, pair._2.getLength, charset)
           }
         }
     }

http://git-wip-us.apache.org/repos/asf/spark/blob/453dae56/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 8fdd31a..071b5ef 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -122,7 +122,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
     assert(exception.getMessage.contains("1-9588-osi"))
   }
 
-  ignore("test different encoding") {
+  test("test different encoding") {
     // scalastyle:off
     sqlContext.sql(
       s"""
@@ -135,6 +135,18 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
     verifyCars(sqlContext.table("carsTable"), withHeader = true)
   }
 
+  test("test aliases sep and encoding for delimiter and charset") {
+    val cars = sqlContext
+      .read
+      .format("csv")
+      .option("header", "true")
+      .option("encoding", "iso-8859-1")
+      .option("sep", "þ")
+      .load(testFile(carsFile8859))
+
+    verifyCars(cars, withHeader = true)
+  }
+
   test("DDL test with tab separated file") {
     sqlContext.sql(
       s"""
@@ -337,5 +349,4 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
     assert(results(0).toSeq === Array(2012, "Tesla", "S", "null", "null"))
     assert(results(2).toSeq === Array(null, "Chevy", "Volt", null, null))
   }
-
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org