You are viewing a plain text version of this content. The canonical link for it is here.
Posted to reviews@spark.apache.org by "sandip-db (via GitHub)" <gi...@apache.org> on 2024/01/03 22:01:17 UTC

Re: [PR] [SPARK-46382][SQL] XML: Refactor the handling of values interspersed between elements [spark]

sandip-db commented on code in PR #44571:
URL: https://github.com/apache/spark/pull/44571#discussion_r1440997050


##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala:
##########
@@ -235,22 +227,6 @@ class XmlInferSchema(options: XmlOptions, caseSensitive: Boolean)
     val nameToDataType =
       collection.mutable.TreeMap.empty[String, DataType](caseSensitivityOrdering)
 

Review Comment:
   Define `firstEventIsStartElement` here:
   
   `val firstEventIsStartElement = parser.peek().isStartElement`



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala:
##########
@@ -288,16 +265,17 @@ class XmlInferSchema(options: XmlOptions, caseSensitive: Boolean)
             case dt: DataType => dt
           }
           // Add the field and datatypes so that we can check if this is ArrayType.
-          val field = StaxXmlParserUtils.getName(e.asStartElement.getName, options)
           addOrUpdateType(nameToDataType, field, inferredType)
 
         case c: Characters if !c.isWhiteSpace =>
-          // This can be an attribute-only object
+          // This can be a value tag
           val valueTagType = inferFrom(c.getData)
           addOrUpdateType(nameToDataType, options.valueTag, valueTagType)
 
-        case _: EndElement =>
-          shouldStop = inferAndCheckEndElement(parser)
+        case e: EndElement =>
+          // In case of corrupt records, we shouldn't read beyond the EndDocument
+          shouldStop = parser.peek().isInstanceOf[EndDocument] || StaxXmlParserUtils
+              .getName(e.getName, options) == startElementName

Review Comment:
   ```suggestion
             shouldStop = if (firstEventIsStartElement) {
                          StaxXmlParserUtils.checkEndElement(parser)
                         } else {
                           true
                         }
   ```



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala:
##########
@@ -211,6 +202,7 @@ class XmlInferSchema(options: XmlOptions, caseSensitive: Boolean)
    */
   private def inferObject(
       parser: XMLEventReader,
+      startElementName: String,

Review Comment:
   This won't be necessary after the change below.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org