You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2018/01/31 16:24:51 UTC
spark git commit: revert [SPARK-22785][SQL] remove
ColumnVector.anyNullsSet
Repository: spark
Updated Branches:
refs/heads/master 3d0911bbe -> 48dd6a4c7
revert [SPARK-22785][SQL] remove ColumnVector.anyNullsSet
## What changes were proposed in this pull request?
In https://github.com/apache/spark/pull/19980, we thought `anyNullsSet` could simply be implemented as `numNulls() > 0`. This is logically correct, but may cause performance problems.
`OrcColumnVector` is an example. It doesn't have a `numNulls` property; it only has a `noNulls` property. We would lose a lot of performance if we used `numNulls() > 0` to check for nulls.
This PR simply reverts #19980, renaming the method to `hasNull`. Better name suggestions are welcome, e.g. `nullable`?
## How was this patch tested?
Existing tests.
Author: Wenchen Fan <we...@databricks.com>
Closes #20452 from cloud-fan/null.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/48dd6a4c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/48dd6a4c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/48dd6a4c
Branch: refs/heads/master
Commit: 48dd6a4c79e33a8f2dba8349b58aa07e4796a925
Parents: 3d0911b
Author: Wenchen Fan <we...@databricks.com>
Authored: Thu Feb 1 00:24:42 2018 +0800
Committer: Wenchen Fan <we...@databricks.com>
Committed: Thu Feb 1 00:24:42 2018 +0800
----------------------------------------------------------------------
.../sql/execution/datasources/orc/OrcColumnVector.java | 5 +++++
.../sql/execution/vectorized/OffHeapColumnVector.java | 2 +-
.../sql/execution/vectorized/OnHeapColumnVector.java | 2 +-
.../sql/execution/vectorized/WritableColumnVector.java | 7 ++++++-
.../apache/spark/sql/vectorized/ArrowColumnVector.java | 5 +++++
.../org/apache/spark/sql/vectorized/ColumnVector.java | 5 +++++
.../execution/vectorized/ArrowColumnVectorSuite.scala | 12 ++++++++++++
.../sql/execution/vectorized/ColumnarBatchSuite.scala | 9 +++++++++
8 files changed, 44 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/48dd6a4c/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java
----------------------------------------------------------------------
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java
index 5078bc7..78203e3 100644
--- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java
+++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java
@@ -78,6 +78,11 @@ public class OrcColumnVector extends org.apache.spark.sql.vectorized.ColumnVecto
}
@Override
+ public boolean hasNull() {
+ return !baseData.noNulls;
+ }
+
+ @Override
public int numNulls() {
if (baseData.isRepeating) {
if (baseData.isNull[0]) {
http://git-wip-us.apache.org/repos/asf/spark/blob/48dd6a4c/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java
----------------------------------------------------------------------
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java
index 1c45b84..fa52e4a 100644
--- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java
+++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java
@@ -123,7 +123,7 @@ public final class OffHeapColumnVector extends WritableColumnVector {
@Override
public void putNotNulls(int rowId, int count) {
- if (numNulls == 0) return;
+ if (!hasNull()) return;
long offset = nulls + rowId;
for (int i = 0; i < count; ++i, ++offset) {
Platform.putByte(null, offset, (byte) 0);
http://git-wip-us.apache.org/repos/asf/spark/blob/48dd6a4c/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java
----------------------------------------------------------------------
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java
index 1d538fe..cccef78 100644
--- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java
+++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java
@@ -119,7 +119,7 @@ public final class OnHeapColumnVector extends WritableColumnVector {
@Override
public void putNotNulls(int rowId, int count) {
- if (numNulls == 0) return;
+ if (!hasNull()) return;
for (int i = 0; i < count; ++i) {
nulls[rowId + i] = (byte)0;
}
http://git-wip-us.apache.org/repos/asf/spark/blob/48dd6a4c/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java
----------------------------------------------------------------------
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java
index a8ec8ef..8ebc1ad 100644
--- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java
+++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java
@@ -59,8 +59,8 @@ public abstract class WritableColumnVector extends ColumnVector {
elementsAppended = 0;
if (numNulls > 0) {
putNotNulls(0, capacity);
+ numNulls = 0;
}
- numNulls = 0;
}
@Override
@@ -103,6 +103,11 @@ public abstract class WritableColumnVector extends ColumnVector {
}
@Override
+ public boolean hasNull() {
+ return numNulls > 0;
+ }
+
+ @Override
public int numNulls() { return numNulls; }
/**
http://git-wip-us.apache.org/repos/asf/spark/blob/48dd6a4c/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java
----------------------------------------------------------------------
diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java
index a75d76b..5ff6474 100644
--- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java
+++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java
@@ -38,6 +38,11 @@ public final class ArrowColumnVector extends ColumnVector {
private ArrowColumnVector[] childColumns;
@Override
+ public boolean hasNull() {
+ return accessor.getNullCount() > 0;
+ }
+
+ @Override
public int numNulls() {
return accessor.getNullCount();
}
http://git-wip-us.apache.org/repos/asf/spark/blob/48dd6a4c/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java
----------------------------------------------------------------------
diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java
index 111f5d9..d588956 100644
--- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java
+++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java
@@ -66,6 +66,11 @@ public abstract class ColumnVector implements AutoCloseable {
public abstract void close();
/**
+ * Returns true if this column vector contains any null values.
+ */
+ public abstract boolean hasNull();
+
+ /**
* Returns the number of nulls in this column vector.
*/
public abstract int numNulls();
http://git-wip-us.apache.org/repos/asf/spark/blob/48dd6a4c/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala
index e794f50..b55489c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala
@@ -42,6 +42,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {
val columnVector = new ArrowColumnVector(vector)
assert(columnVector.dataType === BooleanType)
+ assert(columnVector.hasNull)
assert(columnVector.numNulls === 1)
(0 until 10).foreach { i =>
@@ -69,6 +70,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {
val columnVector = new ArrowColumnVector(vector)
assert(columnVector.dataType === ByteType)
+ assert(columnVector.hasNull)
assert(columnVector.numNulls === 1)
(0 until 10).foreach { i =>
@@ -96,6 +98,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {
val columnVector = new ArrowColumnVector(vector)
assert(columnVector.dataType === ShortType)
+ assert(columnVector.hasNull)
assert(columnVector.numNulls === 1)
(0 until 10).foreach { i =>
@@ -123,6 +126,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {
val columnVector = new ArrowColumnVector(vector)
assert(columnVector.dataType === IntegerType)
+ assert(columnVector.hasNull)
assert(columnVector.numNulls === 1)
(0 until 10).foreach { i =>
@@ -150,6 +154,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {
val columnVector = new ArrowColumnVector(vector)
assert(columnVector.dataType === LongType)
+ assert(columnVector.hasNull)
assert(columnVector.numNulls === 1)
(0 until 10).foreach { i =>
@@ -177,6 +182,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {
val columnVector = new ArrowColumnVector(vector)
assert(columnVector.dataType === FloatType)
+ assert(columnVector.hasNull)
assert(columnVector.numNulls === 1)
(0 until 10).foreach { i =>
@@ -204,6 +210,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {
val columnVector = new ArrowColumnVector(vector)
assert(columnVector.dataType === DoubleType)
+ assert(columnVector.hasNull)
assert(columnVector.numNulls === 1)
(0 until 10).foreach { i =>
@@ -232,6 +239,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {
val columnVector = new ArrowColumnVector(vector)
assert(columnVector.dataType === StringType)
+ assert(columnVector.hasNull)
assert(columnVector.numNulls === 1)
(0 until 10).foreach { i =>
@@ -258,6 +266,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {
val columnVector = new ArrowColumnVector(vector)
assert(columnVector.dataType === BinaryType)
+ assert(columnVector.hasNull)
assert(columnVector.numNulls === 1)
(0 until 10).foreach { i =>
@@ -300,6 +309,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {
val columnVector = new ArrowColumnVector(vector)
assert(columnVector.dataType === ArrayType(IntegerType))
+ assert(columnVector.hasNull)
assert(columnVector.numNulls === 1)
val array0 = columnVector.getArray(0)
@@ -344,6 +354,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {
val columnVector = new ArrowColumnVector(vector)
assert(columnVector.dataType === schema)
+ assert(!columnVector.hasNull)
assert(columnVector.numNulls === 0)
val row0 = columnVector.getStruct(0)
@@ -396,6 +407,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {
val columnVector = new ArrowColumnVector(vector)
assert(columnVector.dataType === schema)
+ assert(columnVector.hasNull)
assert(columnVector.numNulls === 1)
val row0 = columnVector.getStruct(0)
http://git-wip-us.apache.org/repos/asf/spark/blob/48dd6a4c/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala
index 925c101..168bc5e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala
@@ -66,22 +66,27 @@ class ColumnarBatchSuite extends SparkFunSuite {
column =>
val reference = mutable.ArrayBuffer.empty[Boolean]
var idx = 0
+ assert(!column.hasNull)
assert(column.numNulls() == 0)
column.appendNotNull()
reference += false
+ assert(!column.hasNull)
assert(column.numNulls() == 0)
column.appendNotNulls(3)
(1 to 3).foreach(_ => reference += false)
+ assert(!column.hasNull)
assert(column.numNulls() == 0)
column.appendNull()
reference += true
+ assert(column.hasNull)
assert(column.numNulls() == 1)
column.appendNulls(3)
(1 to 3).foreach(_ => reference += true)
+ assert(column.hasNull)
assert(column.numNulls() == 4)
idx = column.elementsAppended
@@ -89,11 +94,13 @@ class ColumnarBatchSuite extends SparkFunSuite {
column.putNotNull(idx)
reference += false
idx += 1
+ assert(column.hasNull)
assert(column.numNulls() == 4)
column.putNull(idx)
reference += true
idx += 1
+ assert(column.hasNull)
assert(column.numNulls() == 5)
column.putNulls(idx, 3)
@@ -101,6 +108,7 @@ class ColumnarBatchSuite extends SparkFunSuite {
reference += true
reference += true
idx += 3
+ assert(column.hasNull)
assert(column.numNulls() == 8)
column.putNotNulls(idx, 4)
@@ -109,6 +117,7 @@ class ColumnarBatchSuite extends SparkFunSuite {
reference += false
reference += false
idx += 4
+ assert(column.hasNull)
assert(column.numNulls() == 8)
reference.zipWithIndex.foreach { v =>
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org