You are viewing a plain text version of this content. The canonical link for it is here.
Posted to reviews@spark.apache.org by ueshin <gi...@git.apache.org> on 2017/05/03 06:36:06 UTC

[GitHub] spark pull request #16781: [SPARK-12297][SQL] Hive compatibility for Parquet...

Github user ueshin commented on a diff in the pull request:

    https://github.com/apache/spark/pull/16781#discussion_r114477668
  
    --- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala ---
    @@ -141,4 +152,373 @@ class ParquetHiveCompatibilitySuite extends ParquetCompatibilityTest with TestHi
           Row(Seq(Row(1))),
           "ARRAY<STRUCT<array_element: INT>>")
       }
    +
    +  val testTimezones = Seq(
    +    "UTC" -> "UTC",
    +    "LA" -> "America/Los_Angeles",
    +    "Berlin" -> "Europe/Berlin"
    +  )
    +  // Check creating parquet tables with timestamps, writing data into them, and reading it back out
    +  // under a variety of conditions:
    +  // * tables with explicit tz and those without
    +  // * altering table properties directly
    +  // * variety of timezones, local & non-local
    +  val sessionTimezones = testTimezones.map(_._2).map(Some(_)) ++ Seq(None)
    +  sessionTimezones.foreach { sessionTzOpt =>
    +    val sparkSession = spark.newSession()
    +    sessionTzOpt.foreach { tz => sparkSession.conf.set(SQLConf.SESSION_LOCAL_TIMEZONE.key, tz) }
    +    testCreateWriteRead(sparkSession, "no_tz", None, sessionTzOpt)
    +    val localTz = TimeZone.getDefault.getID()
    +    testCreateWriteRead(sparkSession, "local", Some(localTz), sessionTzOpt)
    +    // check with a variety of timezones.  The unit tests currently are configured to always use
    +    // America/Los_Angeles, but even if they didn't, we'd be sure to cover a non-local timezone.
    +    testTimezones.foreach { case (tableName, zone) =>
    +      if (zone != localTz) {
    +        testCreateWriteRead(sparkSession, tableName, Some(zone), sessionTzOpt)
    +      }
    +    }
    +  }
    +
    +  private def testCreateWriteRead(
    +      sparkSession: SparkSession,
    +      baseTable: String,
    +      explicitTz: Option[String],
    +      sessionTzOpt: Option[String]): Unit = {
    +    testCreateAlterTablesWithTimezone(sparkSession, baseTable, explicitTz, sessionTzOpt)
    +    testWriteTablesWithTimezone(sparkSession, baseTable, explicitTz, sessionTzOpt)
    +    testReadTablesWithTimezone(sparkSession, baseTable, explicitTz, sessionTzOpt)
    +  }
    +
    +  private def checkHasTz(spark: SparkSession, table: String, tz: Option[String]): Unit = {
    +    val tableMetadata = spark.sessionState.catalog.getTableMetadata(TableIdentifier(table))
    +    assert(tableMetadata.properties.get(ParquetFileFormat.PARQUET_TIMEZONE_TABLE_PROPERTY) === tz)
    +  }
    +
    +  private def testCreateAlterTablesWithTimezone(
    +      spark: SparkSession,
    +      baseTable: String,
    +      explicitTz: Option[String],
    +      sessionTzOpt: Option[String]): Unit = {
    +    test(s"SPARK-12297: Create and Alter Parquet tables and timezones; explicitTz = $explicitTz; " +
    +      s"sessionTzOpt = $sessionTzOpt") {
    +      val key = ParquetFileFormat.PARQUET_TIMEZONE_TABLE_PROPERTY
    +      withTable(baseTable, s"like_$baseTable", s"select_$baseTable", s"partitioned_$baseTable") {
    +        // If we ever add a property to set the table timezone by default, defaultTz would change
    +        val defaultTz = None
    +        // check that created tables have correct TBLPROPERTIES
    +        val tblProperties = explicitTz.map {
    +          tz => raw"""TBLPROPERTIES ($key="$tz")"""
    +        }.getOrElse("")
    +        spark.sql(
    +          raw"""CREATE TABLE $baseTable (
    +                |  x int
    +                | )
    +                | STORED AS PARQUET
    +                | $tblProperties
    +            """.stripMargin)
    +        val expectedTableTz = explicitTz.orElse(defaultTz)
    +        checkHasTz(spark, baseTable, expectedTableTz)
    +        spark.sql(
    +          raw"""CREATE TABLE partitioned_$baseTable (
    +                |  x int
    +                | )
    +                | PARTITIONED BY (y int)
    +                | STORED AS PARQUET
    +                | $tblProperties
    +            """.stripMargin)
    +        checkHasTz(spark, s"partitioned_$baseTable", expectedTableTz)
    +        spark.sql(s"CREATE TABLE like_$baseTable LIKE $baseTable")
    +        checkHasTz(spark, s"like_$baseTable", expectedTableTz)
    +        spark.sql(
    +          raw"""CREATE TABLE select_$baseTable
    +                | STORED AS PARQUET
    +                | AS
    +                | SELECT * from $baseTable
    +            """.stripMargin)
    +        checkHasTz(spark, s"select_$baseTable", defaultTz)
    +
    +        // check alter table, setting, unsetting, resetting the property
    +        spark.sql(
    +          raw"""ALTER TABLE $baseTable SET TBLPROPERTIES ($key="America/Los_Angeles")""")
    +        checkHasTz(spark, baseTable, Some("America/Los_Angeles"))
    +        spark.sql(raw"""ALTER TABLE $baseTable SET TBLPROPERTIES ($key="UTC")""")
    +        checkHasTz(spark, baseTable, Some("UTC"))
    +        spark.sql(raw"""ALTER TABLE $baseTable UNSET TBLPROPERTIES ($key)""")
    +        checkHasTz(spark, baseTable, None)
    +        explicitTz.foreach { tz =>
    +          spark.sql(raw"""ALTER TABLE $baseTable SET TBLPROPERTIES ($key="$tz")""")
    +          checkHasTz(spark, baseTable, expectedTableTz)
    +        }
    +      }
    +    }
    +  }
    +
    +  val desiredTimestampStrings = Seq(
    +    "2015-12-31 22:49:59.123",
    +    "2015-12-31 23:50:59.123",
    +    "2016-01-01 00:39:59.123",
    +    "2016-01-01 01:29:59.123"
    +  )
    +  // We don't want to mess with timezones inside the tests themselves, since we use a shared
    +  // spark context, and then we might be prone to issues from lazy vals for timezones.  Instead,
    +  // we manually adjust the timezone just to determine what the desired millis (since epoch, in utc)
    +  // is for various "wall-clock" times in different timezones, and then we can compare against those
    +  // in our tests.
    +  val timestampTimezoneToMillis = {
    +    val originalTz = TimeZone.getDefault
    +    try {
    +      (for {
    --- End diff --
    
    Let's use `flatMap { ... map { ... } }` instead of the `for` comprehension here.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org