You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by do...@apache.org on 2020/10/06 03:15:30 UTC
[spark] branch branch-2.4 updated: [SPARK-30201][SQL][2.4]
HiveOutputWriter standardOI should use ObjectInspectorCopyOption.DEFAULT
This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-2.4 by this push:
new 1a77846 [SPARK-30201][SQL][2.4] HiveOutputWriter standardOI should use ObjectInspectorCopyOption.DEFAULT
1a77846 is described below
commit 1a77846775a62f48661c39c0f00914524d2e7014
Author: ulysses <yo...@weidian.com>
AuthorDate: Mon Oct 5 20:13:04 2020 -0700
[SPARK-30201][SQL][2.4] HiveOutputWriter standardOI should use ObjectInspectorCopyOption.DEFAULT
### What changes were proposed in this pull request?
This is a backport of #26831.
Now Spark uses `ObjectInspectorCopyOption.JAVA` as the object-inspector copy option, which converts any string to a UTF-8 string. When writing data that is not valid UTF-8, the replacement bytes `EFBFBD` appear in the output.
We should use `ObjectInspectorCopyOption.DEFAULT` instead, so the raw bytes are passed through unchanged.
### Why are the changes needed?
Here is the way to reproduce:
1. create a file containing the hexadecimal (base-16) bytes 'AABBCC', which are not valid UTF-8.
2. create table test1 (c string) location '$file_path';
3. select hex(c) from test1; // AABBCC
4. create table test2 (c string) as select c from test1;
5. select hex(c) from test2; // EFBFBDEFBFBDEFBFBD
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Pass the CI.
Closes #29948 from anuragmantri/SPARK-30201-2.4.
Authored-by: ulysses <yo...@weidian.com>
Signed-off-by: Dongjoon Hyun <dh...@apple.com>
---
.../org/apache/spark/sql/hive/HiveInspectors.scala | 9 ++++++--
.../spark/sql/hive/execution/HiveFileFormat.scala | 7 ++++++-
.../org/apache/spark/sql/hive/InsertSuite.scala | 24 ++++++++++++++++++++++
3 files changed, 37 insertions(+), 3 deletions(-)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
index 4dec2f7..65c6fcc 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
@@ -304,12 +304,17 @@ private[hive] trait HiveInspectors {
withNullSafe(o => getByteWritable(o))
case _: ByteObjectInspector =>
withNullSafe(o => o.asInstanceOf[java.lang.Byte])
- case _: JavaHiveVarcharObjectInspector =>
+ // To spark HiveVarchar and HiveChar are same as string
+ case _: HiveVarcharObjectInspector if x.preferWritable() =>
+ withNullSafe(o => getStringWritable(o))
+ case _: HiveVarcharObjectInspector =>
withNullSafe { o =>
val s = o.asInstanceOf[UTF8String].toString
new HiveVarchar(s, s.length)
}
- case _: JavaHiveCharObjectInspector =>
+ case _: HiveCharObjectInspector if x.preferWritable() =>
+ withNullSafe(o => getStringWritable(o))
+ case _: HiveCharObjectInspector =>
withNullSafe { o =>
val s = o.asInstanceOf[UTF8String].toString
new HiveChar(s, s.length)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala
index 4a7cd69..293b693 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala
@@ -128,10 +128,15 @@ class HiveOutputWriter(
new Path(path),
Reporter.NULL)
+ /**
+ * Since SPARK-30201 ObjectInspectorCopyOption.JAVA change to ObjectInspectorCopyOption.DEFAULT.
+ * The reason is DEFAULT option can convert `UTF8String` to `Text` with bytes and
+ * we can compatible with non UTF-8 code bytes during write.
+ */
private val standardOI = ObjectInspectorUtils
.getStandardObjectInspector(
tableDesc.getDeserializer(jobConf).getObjectInspector,
- ObjectInspectorCopyOption.JAVA)
+ ObjectInspectorCopyOption.DEFAULT)
.asInstanceOf[StructObjectInspector]
private val fieldOIs =
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala
index 510de3a..224a219 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.hive
import java.io.File
+import com.google.common.io.Files
import org.scalatest.BeforeAndAfter
import org.apache.spark.SparkException
@@ -785,4 +786,27 @@ class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter
}
}
}
+
+ test("SPARK-30201 HiveOutputWriter standardOI should use ObjectInspectorCopyOption.DEFAULT") {
+ withTable("t1", "t2") {
+ withTempDir { dir =>
+ val file = new File(dir, "test.hex")
+ val hex = "AABBCC"
+ val bs = org.apache.commons.codec.binary.Hex.decodeHex(hex.toCharArray)
+ Files.write(bs, file)
+ val path = file.getParent
+ sql(s"create table t1 (c string) STORED AS TEXTFILE location '$path'")
+ checkAnswer(
+ sql("select hex(c) from t1"),
+ Row(hex)
+ )
+
+ sql("create table t2 as select c from t1")
+ checkAnswer(
+ sql("select hex(c) from t2"),
+ Row(hex)
+ )
+ }
+ }
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org