You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by ja...@apache.org on 2014/11/13 18:18:50 UTC
[06/10] incubator-drill git commit: DRILL-1434: In ParquetGroupScan
compute the non-null value count of a column if stats are available for each
chunk. Don't apply ConvertCountToDirectScan rule if stats are not available.
DRILL-1434: In ParquetGroupScan compute the non-null value count of a column if stats are available for each chunk. Don't apply ConvertCountToDirectScan rule if stats are not available.
Project: http://git-wip-us.apache.org/repos/asf/incubator-drill/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-drill/commit/3c3b3d55
Tree: http://git-wip-us.apache.org/repos/asf/incubator-drill/tree/3c3b3d55
Diff: http://git-wip-us.apache.org/repos/asf/incubator-drill/diff/3c3b3d55
Branch: refs/heads/master
Commit: 3c3b3d55d0b8b71100b2df55d482e89d0f7f0f9c
Parents: 8695cdf
Author: Aman Sinha <as...@maprtech.com>
Authored: Sun Nov 9 18:18:47 2014 -0800
Committer: Jacques Nadeau <ja...@apache.org>
Committed: Thu Nov 13 09:17:27 2014 -0800
----------------------------------------------------------------------
.../drill/exec/physical/base/GroupScan.java | 1 +
.../physical/ConvertCountToDirectScan.java | 4 ++++
.../exec/store/parquet/ParquetGroupScan.java | 22 ++++++++++++++++++--
3 files changed, 25 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/3c3b3d55/exec/java-exec/src/main/java/org/apache/drill/exec/physical/base/GroupScan.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/base/GroupScan.java b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/base/GroupScan.java
index 2f94995..3e5e408 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/base/GroupScan.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/base/GroupScan.java
@@ -34,6 +34,7 @@ import com.google.common.collect.Lists;
public interface GroupScan extends Scan, HasAffinity{
public static final List<SchemaPath> ALL_COLUMNS = Lists.<SchemaPath>newArrayList(SchemaPath.getSimplePath("*"));
+ public static final long NO_COLUMN_STATS = -1;
public abstract void applyAssignments(List<DrillbitEndpoint> endpoints) throws PhysicalOperatorSetupException;
http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/3c3b3d55/exec/java-exec/src/main/java/org/apache/drill/exec/planner/physical/ConvertCountToDirectScan.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/physical/ConvertCountToDirectScan.java b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/physical/ConvertCountToDirectScan.java
index 620cf1f..d794805 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/physical/ConvertCountToDirectScan.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/physical/ConvertCountToDirectScan.java
@@ -119,6 +119,10 @@ public class ConvertCountToDirectScan extends Prule {
String columnName = scan.getRowType().getFieldNames().get(index).toLowerCase();
cnt = oldGrpScan.getColumnValueCount(SchemaPath.getSimplePath(columnName));
+ if (cnt == GroupScan.NO_COLUMN_STATS) {
+ // if column stats are not available don't apply this rule
+ return;
+ }
} else {
return; // do nothing.
}
http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/3c3b3d55/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java
index dab20e3..7882b66 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java
@@ -227,8 +227,26 @@ public class ParquetGroupScan extends AbstractFileGroupScan {
valueCountInGrp = Math.max(col.getValueCount(), valueCountInGrp);
SchemaPath path = SchemaPath.getSimplePath(col.getPath().toString().replace("[", "").replace("]", "").toLowerCase());
- long valueCount = columnValueCounts.containsKey(path) ? columnValueCounts.get(path) : 0;
- columnValueCounts.put(path, valueCount + col.getValueCount());
+ long previousCount = 0;
+ long currentCount = 0;
+
+ if (! columnValueCounts.containsKey(path)) {
+ // create an entry for this column
+ columnValueCounts.put(path, previousCount /* initialize to 0 */);
+ } else {
+ previousCount = columnValueCounts.get(path);
+ }
+
+ boolean statsAvail = (col.getStatistics() != null && !col.getStatistics().isEmpty());
+
+ if (statsAvail && previousCount != GroupScan.NO_COLUMN_STATS) {
+ currentCount = col.getValueCount() - col.getStatistics().getNumNulls(); // only count non-nulls
+ columnValueCounts.put(path, previousCount + currentCount);
+ } else {
+ // even if 1 chunk does not have stats, we cannot rely on the value count for this column
+ columnValueCounts.put(path, GroupScan.NO_COLUMN_STATS);
+ }
+
}
String filePath = footer.getFile().toUri().getPath();