You are viewing a plain text version of this content. The canonical link for it is here.
Posted to reviews@spark.apache.org by "cloud-fan (via GitHub)" <gi...@apache.org> on 2023/08/10 04:41:50 UTC

[GitHub] [spark] cloud-fan commented on a diff in pull request #42424: [SPARK-43380][SQL] Fix Avro data type conversion issues without causing performance regression

cloud-fan commented on code in PR #42424:
URL: https://github.com/apache/spark/pull/42424#discussion_r1289540318


##########
connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroDeserializer.scala:
##########
@@ -118,268 +118,225 @@ private[sql] class AvroDeserializer(
     val incompatibleMsg = errorPrefix +
         s"schema is incompatible (avroType = $avroType, sqlType = ${catalystType.sql})"
 
+    val realDataType = SchemaConverters.toSqlType(avroType).dataType
     val confKey = SQLConf.LEGACY_AVRO_ALLOW_INCOMPATIBLE_SCHEMA
     val preventReadingIncorrectType = !SQLConf.get.getConf(confKey)
 
-    val logicalDataType = SchemaConverters.toSqlType(avroType).dataType
-    avroType.getType match {
-      case NULL =>
-        (logicalDataType, catalystType) match {
-          case (_, NullType) => (updater, ordinal, _) =>
-            updater.setNullAt(ordinal)
-          case _ => throw new IncompatibleSchemaException(incompatibleMsg)
-        }
+    (avroType.getType, catalystType) match {
+      case (NULL, NullType) => (updater, ordinal, _) =>
+        updater.setNullAt(ordinal)
+
       // TODO: we can avoid boxing if future version of avro provide primitive accessors.
-      case BOOLEAN =>
-        (logicalDataType, catalystType) match {
-          case (_, BooleanType) => (updater, ordinal, value) =>
-            updater.setBoolean(ordinal, value.asInstanceOf[Boolean])
-          case _ => throw new IncompatibleSchemaException(incompatibleMsg)
-        }
+      case (BOOLEAN, BooleanType) => (updater, ordinal, value) =>
+        updater.setBoolean(ordinal, value.asInstanceOf[Boolean])
+
+      case (INT, IntegerType) => (updater, ordinal, value) =>
+        updater.setInt(ordinal, value.asInstanceOf[Int])
+
+      case (LONG, dt: TimestampType)
+        if preventReadingIncorrectType && realDataType.isInstanceOf[DayTimeIntervalType] =>
+        throw QueryCompilationErrors.avroIncorrectTypeError(toFieldStr(avroPath),
+          toFieldStr(catalystPath), realDataType.catalogString, dt.catalogString, confKey.key)
+
+      case (LONG, dt: TimestampNTZType)
+        if preventReadingIncorrectType && realDataType.isInstanceOf[DayTimeIntervalType] =>
+        throw QueryCompilationErrors.avroIncorrectTypeError(toFieldStr(avroPath),
+          toFieldStr(catalystPath), realDataType.catalogString, dt.catalogString, confKey.key)
+
+      case (LONG, dt: DateType)
+        if preventReadingIncorrectType && realDataType.isInstanceOf[DayTimeIntervalType] =>
+        throw QueryCompilationErrors.avroIncorrectTypeError(toFieldStr(avroPath),
+          toFieldStr(catalystPath), realDataType.catalogString, dt.catalogString, confKey.key)
+
+      case (INT, dt: TimestampType)
+        if preventReadingIncorrectType && realDataType.isInstanceOf[YearMonthIntervalType] =>
+        throw QueryCompilationErrors.avroIncorrectTypeError(toFieldStr(avroPath),
+          toFieldStr(catalystPath), realDataType.catalogString, dt.catalogString, confKey.key)
+
+      case (INT, dt: TimestampNTZType)
+        if preventReadingIncorrectType && realDataType.isInstanceOf[YearMonthIntervalType] =>
+        throw QueryCompilationErrors.avroIncorrectTypeError(toFieldStr(avroPath),
+          toFieldStr(catalystPath), realDataType.catalogString, dt.catalogString, confKey.key)
+
+      case (INT, dt: DateType)
+        if preventReadingIncorrectType && realDataType.isInstanceOf[YearMonthIntervalType] =>
+        throw QueryCompilationErrors.avroIncorrectTypeError(toFieldStr(avroPath),
+          toFieldStr(catalystPath), realDataType.catalogString, dt.catalogString, confKey.key)
+
+      case (INT, DateType) => (updater, ordinal, value) =>

Review Comment:
   It seems the logic has also changed. Are you sure where the perf regression comes from: the code structure (the pattern match) or the logic itself?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org