You are viewing a plain text version of this content. The canonical link for it is here.
Posted to reviews@spark.apache.org by GitBox <gi...@apache.org> on 2019/07/08 21:11:22 UTC

[GitHub] [spark] hvanhovell commented on a change in pull request #25024: [SPARK-27296][SQL] User Defined Aggregators that do not ser/de on each input row

hvanhovell commented on a change in pull request #25024: [SPARK-27296][SQL] User Defined Aggregators that do not ser/de on each input row
URL: https://github.com/apache/spark/pull/25024#discussion_r301301629
 
 

 ##########
 File path: sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala
 ##########
 @@ -450,3 +451,76 @@ case class ScalaUDAF(
 
   override def nodeName: String = udaf.getClass.getSimpleName
 }
+
+/**
+ * The internal wrapper used to hook a [[UserDefinedImperativeAggregator]] `udia` in the
+ * internal aggregation code path.
+ */
+case class ScalaUDIA[T](
+    children: Seq[Expression],
+    udia: UserDefinedImperativeAggregator[T],
+    mutableAggBufferOffset: Int = 0,
+    inputAggBufferOffset: Int = 0)
+  extends TypedImperativeAggregate[T]
+  with NonSQLExpression
+  with UserDefinedExpression
+  with ImplicitCastInputTypes
+  with Logging {
+
+  def dataType: DataType = udia.resultType
+
+  val inputTypes: Seq[DataType] = udia.inputSchema.map(_.dataType)
+
+  def nullable: Boolean = true
+
+  override lazy val deterministic: Boolean = udia.deterministic
+
+  def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): ScalaUDIA[T] =
+    copy(mutableAggBufferOffset = newMutableAggBufferOffset)
+
+  def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): ScalaUDIA[T] =
+    copy(inputAggBufferOffset = newInputAggBufferOffset)
+
+  private[this] lazy val childrenSchema: StructType = {
+    val inputFields = children.zipWithIndex.map {
+      case (child, index) =>
+        StructField(s"input$index", child.dataType, child.nullable, Metadata.empty)
+    }
+    StructType(inputFields)
+  }
+
+  private lazy val inputProjection = {
+    val inputAttributes = childrenSchema.toAttributes
+    log.debug(
+      s"Creating MutableProj: $children, inputSchema: $inputAttributes.")
+    MutableProjection.create(children, inputAttributes)
+  }
+
+  private[this] lazy val inputToScalaConverters: Any => Any =
+    CatalystTypeConverters.createToScalaConverter(childrenSchema)
+
+  def createAggregationBuffer(): T = udia.initial
+
+  def update(buffer: T, input: InternalRow): T = {
+    val inrow = inputToScalaConverters(inputProjection(input)).asInstanceOf[Row]
 
 Review comment:
   You need to convert the internal row produced by the `inputProjection` to an 'external' Row which the UDAF understands. To answer Reynold's question, yes this does deserialize data. It might be better to use a RowEncoder here if we can.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org