Posted to reviews@spark.apache.org by GitBox <gi...@apache.org> on 2018/12/13 14:01:45 UTC

[GitHub] kjmrknsn closed pull request #23307: [SPARK-26335][SQL] Add a property for Dataset#show not to care about wide characters when padding them

kjmrknsn closed pull request #23307: [SPARK-26335][SQL] Add a property for Dataset#show not to care about wide characters when padding them
URL: https://github.com/apache/spark/pull/23307
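
For context, the pull request proposes a SQL configuration key,
spark.sql.dataset.show.handleFullWidthCharacters (default: true), which controls whether
Dataset#show pads each cell by its visual width (counting full-width characters as two
half-width columns) or simply by its character count. Below is a minimal sketch of how the
property would be used, assuming the patch further below were applied; the pull request was
closed without being merged, so the key is not available in released Spark, and the object
name is only illustrative.

  import org.apache.spark.sql.SparkSession

  object ShowFullWidthExample {
    def main(args: Array[String]): Unit = {
      val spark = SparkSession.builder().master("local[*]").appName("show-width").getOrCreate()
      import spark.implicits._

      // With the proposed key set to false, Dataset#show pads every cell in a column to the
      // same number of characters, which makes the table easier for programs to parse.
      // The default (true) counts full-width characters as two columns, which aligns the
      // table better for human readers.
      spark.conf.set("spark.sql.dataset.show.handleFullWidthCharacters", "false")
      Seq((0, "ab c", 1), (0, "测", 1)).toDF("b", "a", "c").show()

      spark.stop()
    }
  }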
 
 
   

This is a pull request from a forked repository. As GitHub hides the original diff after
the pull request is closed, it is reproduced below for the sake of provenance:

diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index b4ea1ee950217..49c721873377b 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -2822,6 +2822,19 @@ private[spark] object Utils extends Logging {
     if (str == null) 0 else str.length + fullWidthRegex.findAllIn(str).size
   }
 
+  /**
+   * Return the width of the given string.
+   *
+   * @param str a string
+   * @param halfWidth if true, return the half-width count of the string, where each full-width
+   *                  character counts as two half widths;
+   *                  otherwise, return the number of characters in the string.
+   * @return the width of the given string
+   */
+  def stringWidth(str: String, halfWidth: Boolean): Int = {
+    if (str == null) 0 else if (halfWidth) stringHalfWidth(str) else str.length
+  }
+
   def sanitizeDirName(str: String): String = {
     str.replaceAll("[ :/]", "-").replaceAll("[.${}'\"]", "_").toLowerCase(Locale.ROOT)
   }
diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
index b2ff1cce3eb0b..ea6c72d553543 100644
--- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
@@ -1193,6 +1193,44 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {
     // scalastyle:on nonascii
   }
 
+  test("stringWidth") {
+    // scalastyle:off nonascii
+    assert(Utils.stringWidth(null, false) == 0)
+    assert(Utils.stringWidth(null, true) == 0)
+    assert(Utils.stringWidth("", false) == 0)
+    assert(Utils.stringWidth("", true) == 0)
+    assert(Utils.stringWidth("ab c", false) == 4)
+    assert(Utils.stringWidth("ab c", true) == 4)
+    assert(Utils.stringWidth("1098", false) == 4)
+    assert(Utils.stringWidth("1098", true) == 4)
+    assert(Utils.stringWidth("mø", false) == 2)
+    assert(Utils.stringWidth("mø", true) == 2)
+    assert(Utils.stringWidth("γύρ", false) == 3)
+    assert(Utils.stringWidth("γύρ", true) == 3)
+    assert(Utils.stringWidth("pê", false) == 2)
+    assert(Utils.stringWidth("pê", true) == 2)
+    assert(Utils.stringWidth("ー", false) == 1)
+    assert(Utils.stringWidth("ー", true) == 2)
+    assert(Utils.stringWidth("测", false) == 1)
+    assert(Utils.stringWidth("测", true) == 2)
+    assert(Utils.stringWidth("か", false) == 1)
+    assert(Utils.stringWidth("か", true) == 2)
+    assert(Utils.stringWidth("걸", false) == 1)
+    assert(Utils.stringWidth("걸", true) == 2)
+    assert(Utils.stringWidth("à", false) == 1)
+    assert(Utils.stringWidth("à", true) == 1)
+    assert(Utils.stringWidth("焼", false) == 1)
+    assert(Utils.stringWidth("焼", true) == 2)
+    assert(Utils.stringWidth("羍む", false) == 2)
+    assert(Utils.stringWidth("羍む", true) == 4)
+    assert(Utils.stringWidth("뺭ᾘ", false) == 2)
+    assert(Utils.stringWidth("뺭ᾘ", true) == 3)
+    assert(Utils.stringWidth("\u0967\u0968\u0969", false) == 3)
+    assert(Utils.stringWidth("\u0967\u0968\u0969", true) == 3)
+    // scalastyle:on nonascii
+  }
+
+
   test("trimExceptCRLF standalone") {
     val crlfSet = Set("\r", "\n")
     val nonPrintableButCRLF = (0 to 32).map(_.toChar.toString).toSet -- crlfSet
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 86e068bf632bd..3b4351560c061 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -1635,6 +1635,18 @@ object SQLConf {
       "java.time.* packages are used for the same purpose.")
     .booleanConf
     .createWithDefault(false)
+
+  val DATASET_SHOW_HANDLE_FULL_WIDTH_CHARACTERS =
+    buildConf("spark.sql.dataset.show.handleFullWidthCharacters")
+      .doc("When set to true, the width of a full-width character is counted as two " +
+        "half-width characters, which makes the result of " +
+        "`org.apache.spark.sql.Dataset#show` easier for humans to read. However, it also " +
+        "makes the result harder for programs to parse, because each cell of a column may " +
+        "then be padded to a different number of characters depending on how many " +
+        "full-width characters it contains. Set this to false if programs need to parse " +
+        "the result of `org.apache.spark.sql.Dataset#show`.")
+      .booleanConf
+      .createWithDefault(true)
 }
 
 /**
@@ -2062,6 +2074,9 @@ class SQLConf extends Serializable with Logging {
 
   def legacyTimeParserEnabled: Boolean = getConf(SQLConf.LEGACY_TIME_PARSER_ENABLED)
 
+  def datasetShowHandleFullWidthCharacters: Boolean =
+    getConf(SQLConf.DATASET_SHOW_HANDLE_FULL_WIDTH_CHARACTERS)
+
   /** ********************** SQLConf functionality methods ************ */
 
   /** Set Spark SQL configuration properties. */
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index a664c7338badb..ad7e268e4871f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -306,6 +306,8 @@ class Dataset[T] private[sql](
     // We set a minimum column width at '3'
     val minimumColWidth = 3
 
+    val handleFullWidthChars = sparkSession.sessionState.conf.datasetShowHandleFullWidthCharacters
+
     if (!vertical) {
       // Initialise the width of each column to a minimum value
       val colWidths = Array.fill(numCols)(minimumColWidth)
@@ -313,16 +315,21 @@ class Dataset[T] private[sql](
       // Compute the width of each column
       for (row <- rows) {
         for ((cell, i) <- row.zipWithIndex) {
-          colWidths(i) = math.max(colWidths(i), Utils.stringHalfWidth(cell))
+          colWidths(i) = math.max(colWidths(i), Utils.stringWidth(cell, handleFullWidthChars))
         }
       }
 
       val paddedRows = rows.map { row =>
         row.zipWithIndex.map { case (cell, i) =>
+          val colWidth = if (handleFullWidthChars) {
+            colWidths(i) - Utils.stringHalfWidth(cell) + cell.length
+          } else {
+            colWidths(i)
+          }
           if (truncate > 0) {
-            StringUtils.leftPad(cell, colWidths(i) - Utils.stringHalfWidth(cell) + cell.length)
+            StringUtils.leftPad(cell, colWidth)
           } else {
-            StringUtils.rightPad(cell, colWidths(i) - Utils.stringHalfWidth(cell) + cell.length)
+            StringUtils.rightPad(cell, colWidth)
           }
         }
       }
@@ -344,10 +351,10 @@ class Dataset[T] private[sql](
 
       // Compute the width of field name and data columns
       val fieldNameColWidth = fieldNames.foldLeft(minimumColWidth) { case (curMax, fieldName) =>
-        math.max(curMax, Utils.stringHalfWidth(fieldName))
+        math.max(curMax, Utils.stringWidth(fieldName, handleFullWidthChars))
       }
       val dataColWidth = dataRows.foldLeft(minimumColWidth) { case (curMax, row) =>
-        math.max(curMax, row.map(cell => Utils.stringHalfWidth(cell)).max)
+        math.max(curMax, row.map(cell => Utils.stringWidth(cell, handleFullWidthChars)).max)
       }
 
       dataRows.zipWithIndex.foreach { case (row, i) =>
@@ -357,9 +364,17 @@ class Dataset[T] private[sql](
         sb.append(rowHeader).append("\n")
         row.zipWithIndex.map { case (cell, j) =>
           val fieldName = StringUtils.rightPad(fieldNames(j),
-            fieldNameColWidth - Utils.stringHalfWidth(fieldNames(j)) + fieldNames(j).length)
+            if (handleFullWidthChars) {
+              fieldNameColWidth - Utils.stringHalfWidth(fieldNames(j)) + fieldNames(j).length
+            } else {
+              fieldNameColWidth
+            })
           val data = StringUtils.rightPad(cell,
-            dataColWidth - Utils.stringHalfWidth(cell) + cell.length)
+            if (handleFullWidthChars) {
+              dataColWidth - Utils.stringHalfWidth(cell) + cell.length
+            } else {
+              dataColWidth
+            })
           s" $fieldName | $data "
         }.addString(sb, "", "\n", "\n")
       }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
index 525c7cef39563..56872b1146660 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
@@ -1028,6 +1028,58 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
     checkShowString(ds, expected)
   }
 
+  test("SPARK-26335 Add a property for Dataset#show not to care about wide characters " +
+    "when padding them") {
+    withSQLConf(SQLConf.DATASET_SHOW_HANDLE_FULL_WIDTH_CHARACTERS.key -> "false") {
+      // scalastyle:off nonascii
+      val df = Seq(
+        (0, null, 1),
+        (0, "", 1),
+        (0, "ab c", 1),
+        (0, "1098", 1),
+        (0, "mø", 1),
+        (0, "γύρ", 1),
+        (0, "pê", 1),
+        (0, "ー", 1),
+        (0, "测", 1),
+        (0, "か", 1),
+        (0, "걸", 1),
+        (0, "à", 1),
+        (0, "焼", 1),
+        (0, "羍む", 1),
+        (0, "뺭ᾘ", 1),
+        (0, "\u0967\u0968\u0969", 1)
+      ).toDF("b", "a", "c")
+      // scalastyle:on nonascii
+      val ds = df.as[ClassData]
+      val expected =
+      // scalastyle:off nonascii
+        """+---+----+---+
+          ||  b|   a|  c|
+          |+---+----+---+
+          ||  0|null|  1|
+          ||  0|    |  1|
+          ||  0|ab c|  1|
+          ||  0|1098|  1|
+          ||  0|  mø|  1|
+          ||  0| γύρ|  1|
+          ||  0|  pê|  1|
+          ||  0|   ー|  1|
+          ||  0|   测|  1|
+          ||  0|   か|  1|
+          ||  0|   걸|  1|
+          ||  0|   à|  1|
+          ||  0|   焼|  1|
+          ||  0|  羍む|  1|
+          ||  0|  뺭ᾘ|  1|
+          ||  0| १२३|  1|
+          |+---+----+---+
+          |""".stripMargin
+      // scalastyle:on nonascii
+      checkShowString(ds, expected)
+    }
+  }
+
   test(
     "SPARK-15112: EmbedDeserializerInFilter should not optimize plan fragment that changes schema"
   ) {

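A note on the padding arithmetic in the Dataset.scala hunk above: StringUtils.leftPad and
StringUtils.rightPad pad by character count, not by visual width, so when full-width
characters are treated as two columns the target length passed to them is
colWidths(i) - Utils.stringHalfWidth(cell) + cell.length. The sketch below illustrates the
idea with a simplified stand-in for Utils.stringHalfWidth; Spark's implementation matches
several full-width Unicode ranges with a regex, and the Han-script check here is only
illustrative.

  import org.apache.commons.lang3.StringUtils

  object PaddingSketch {
    // Simplified stand-in for Utils.stringHalfWidth: count each Han character as two
    // half-width columns. Spark's real implementation covers more full-width ranges.
    def halfWidth(s: String): Int =
      s.length + s.count(c => Character.UnicodeScript.of(c.toInt) == Character.UnicodeScript.HAN)

    def main(args: Array[String]): Unit = {
      val colWidth = 6          // target visual width of the column, in half-width columns
      val cell = "测a"          // 2 characters, but visual width 3 (2 + 1)
      // Pad to cell.length + (colWidth - halfWidth(cell)) characters so that the padded
      // string occupies exactly colWidth half-width columns.
      val padded = StringUtils.rightPad(cell, colWidth - halfWidth(cell) + cell.length)
      println(s"[$padded]")     // prints "[测a   ]": 5 characters, 6 half-width columns
    }
  }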

 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org