You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by ja...@apache.org on 2014/11/13 18:18:50 UTC

[06/10] incubator-drill git commit: DRILL-1434: In ParquetGroupScan, compute the non-null value count of a column only when stats are available for every chunk. Don't apply the ConvertCountToDirectScan rule if column stats are not available.

DRILL-1434: In ParquetGroupScan, compute the non-null value count of a column only when stats are available for every chunk. Don't apply the ConvertCountToDirectScan rule if column stats are not available.


Project: http://git-wip-us.apache.org/repos/asf/incubator-drill/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-drill/commit/3c3b3d55
Tree: http://git-wip-us.apache.org/repos/asf/incubator-drill/tree/3c3b3d55
Diff: http://git-wip-us.apache.org/repos/asf/incubator-drill/diff/3c3b3d55

Branch: refs/heads/master
Commit: 3c3b3d55d0b8b71100b2df55d482e89d0f7f0f9c
Parents: 8695cdf
Author: Aman Sinha <as...@maprtech.com>
Authored: Sun Nov 9 18:18:47 2014 -0800
Committer: Jacques Nadeau <ja...@apache.org>
Committed: Thu Nov 13 09:17:27 2014 -0800

----------------------------------------------------------------------
 .../drill/exec/physical/base/GroupScan.java     |  1 +
 .../physical/ConvertCountToDirectScan.java      |  4 ++++
 .../exec/store/parquet/ParquetGroupScan.java    | 22 ++++++++++++++++++--
 3 files changed, 25 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/3c3b3d55/exec/java-exec/src/main/java/org/apache/drill/exec/physical/base/GroupScan.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/base/GroupScan.java b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/base/GroupScan.java
index 2f94995..3e5e408 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/base/GroupScan.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/base/GroupScan.java
@@ -34,6 +34,7 @@ import com.google.common.collect.Lists;
 public interface GroupScan extends Scan, HasAffinity{
 
   public static final List<SchemaPath> ALL_COLUMNS = Lists.<SchemaPath>newArrayList(SchemaPath.getSimplePath("*"));
+  public static final long NO_COLUMN_STATS = -1;
 
   public abstract void applyAssignments(List<DrillbitEndpoint> endpoints) throws PhysicalOperatorSetupException;
 

http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/3c3b3d55/exec/java-exec/src/main/java/org/apache/drill/exec/planner/physical/ConvertCountToDirectScan.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/physical/ConvertCountToDirectScan.java b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/physical/ConvertCountToDirectScan.java
index 620cf1f..d794805 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/physical/ConvertCountToDirectScan.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/physical/ConvertCountToDirectScan.java
@@ -119,6 +119,10 @@ public class ConvertCountToDirectScan extends Prule {
         String columnName = scan.getRowType().getFieldNames().get(index).toLowerCase();
 
         cnt = oldGrpScan.getColumnValueCount(SchemaPath.getSimplePath(columnName));
+        if (cnt == GroupScan.NO_COLUMN_STATS) {
+          // if column stats are not available don't apply this rule
+          return;
+        }
       } else {
         return; // do nothing.
       }

http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/3c3b3d55/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java
index dab20e3..7882b66 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java
@@ -227,8 +227,26 @@ public class ParquetGroupScan extends AbstractFileGroupScan {
             valueCountInGrp = Math.max(col.getValueCount(), valueCountInGrp);
             SchemaPath path = SchemaPath.getSimplePath(col.getPath().toString().replace("[", "").replace("]", "").toLowerCase());
 
-            long valueCount = columnValueCounts.containsKey(path) ? columnValueCounts.get(path) : 0;
-            columnValueCounts.put(path, valueCount + col.getValueCount());
+            long previousCount = 0;
+            long currentCount = 0;
+
+            if (! columnValueCounts.containsKey(path)) {
+              // create an entry for this column
+              columnValueCounts.put(path, previousCount /* initialize to 0 */);
+            } else {
+              previousCount = columnValueCounts.get(path);
+            }
+
+            boolean statsAvail = (col.getStatistics() != null && !col.getStatistics().isEmpty());
+
+            if (statsAvail && previousCount != GroupScan.NO_COLUMN_STATS) {
+              currentCount = col.getValueCount() - col.getStatistics().getNumNulls(); // only count non-nulls
+              columnValueCounts.put(path, previousCount + currentCount);
+            } else {
+              // even if 1 chunk does not have stats, we cannot rely on the value count for this column
+              columnValueCounts.put(path, GroupScan.NO_COLUMN_STATS);
+            }
+
           }
 
           String filePath = footer.getFile().toUri().getPath();