You are viewing a plain text version of this content. The canonical link for it is here.
Posted to reviews@spark.apache.org by ueshin <gi...@git.apache.org> on 2018/06/07 18:33:28 UTC
[GitHub] spark pull request #21050: [SPARK-23912][SQL]add array_distinct
Github user ueshin commented on a diff in the pull request:
https://github.com/apache/spark/pull/21050#discussion_r193841141
--- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala ---
@@ -1882,3 +1883,134 @@ case class ArrayRepeat(left: Expression, right: Expression)
}
}
+
+/**
+ * Removes duplicate values from the array.
+ */
+@ExpressionDescription(
+ usage = "_FUNC_(array) - Removes duplicate values from the array.",
+ examples = """
+ Examples:
+ > SELECT _FUNC_(array(1, 2, 3, null, 3));
+ [1,2,3,null]
+ """, since = "2.4.0")
+case class ArrayDistinct(child: Expression)
+ extends UnaryExpression with ExpectsInputTypes {
+
+ override def inputTypes: Seq[AbstractDataType] = Seq(ArrayType)
+
+ override def dataType: DataType = child.dataType
+
+ lazy val elementType: DataType = dataType.asInstanceOf[ArrayType].elementType
+
+ override def nullSafeEval(array: Any): Any = {
+ val elementType = child.dataType.asInstanceOf[ArrayType].elementType
+ val data = array.asInstanceOf[ArrayData].toArray[AnyRef](elementType).distinct
+ new GenericArrayData(data.asInstanceOf[Array[Any]])
+ }
+
+ override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+ nullSafeCodeGen(ctx, ev, (array) => {
+ val i = ctx.freshName("i")
+ val j = ctx.freshName("j")
+ val hs = ctx.freshName("hs")
+ val foundNullElement = ctx.freshName("foundNullElement")
+ val distinctArrayLen = ctx.freshName("distinctArrayLen")
+ val getValue = CodeGenerator.getValue(array, elementType, i)
+ val openHashSet = classOf[OpenHashSet[_]].getName
+ val classTag = s"scala.reflect.ClassTag$$.MODULE$$.Object()"
+ s"""
+ |int $distinctArrayLen = 0;
+ |boolean $foundNullElement = false;
+ |$openHashSet $hs = new $openHashSet($classTag);
+ |for (int $i = 0; $i < $array.numElements(); $i++) {
+ | if ($array.isNullAt($i)) {
+ | if (!($foundNullElement)) {
+ | $distinctArrayLen = $distinctArrayLen + 1;
+ | $foundNullElement = true;
+ | }
+ | }
+ | else {
+ | if (!($hs.contains($getValue))) {
+ | $hs.add($getValue);
+ | $distinctArrayLen = $distinctArrayLen + 1;
+ | }
+ | }
+ |}
--- End diff --
Maybe we can skip some checks here, such as `if (!($foundNullElement))` or `if (!($hs.contains($getValue)))`, since this loop is only counting the distinct array length.
We can simply do `$foundNullElement = true` if a null is found, and otherwise `$hs.add($getValue)`; the length will then be `$hs.size() + ($foundNullElement ? 1 : 0)`.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org