You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@drill.apache.org by "ASF GitHub Bot (JIRA)" <ji...@apache.org> on 2018/07/02 06:25:00 UTC

[jira] [Commented] (DRILL-6554) Minor code improvements in parquet statistics handling

    [ https://issues.apache.org/jira/browse/DRILL-6554?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16529427#comment-16529427 ] 

ASF GitHub Bot commented on DRILL-6554:
---------------------------------------

asfgit closed pull request #1349: DRILL-6554: Minor code improvements in parquet statistics handling
URL: https://github.com/apache/drill/pull/1349
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
index 547dc06704..42e6e0b6a4 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetIsPredicate.java
@@ -113,9 +113,9 @@ public boolean canDrop(RangeExprEvaluator<C> evaluator) {
    * IS TRUE predicate.
    */
   private static LogicalExpression createIsTruePredicate(LogicalExpression expr) {
-    return new ParquetIsPredicate<Boolean>(expr,
+    return new ParquetIsPredicate<Boolean>(expr, (exprStat, evaluator) ->
         //if max value is not true or if there are all nulls  -> canDrop
-        (exprStat, evaluator) -> !((BooleanStatistics)exprStat).getMax() || isAllNulls(exprStat, evaluator.getRowCount())
+        isAllNulls(exprStat, evaluator.getRowCount()) || exprStat.hasNonNullValue() && !((BooleanStatistics) exprStat).getMax()
     );
   }
 
@@ -123,9 +123,9 @@ private static LogicalExpression createIsTruePredicate(LogicalExpression expr) {
    * IS FALSE predicate.
    */
   private static LogicalExpression createIsFalsePredicate(LogicalExpression expr) {
-    return new ParquetIsPredicate<Boolean>(expr,
+    return new ParquetIsPredicate<Boolean>(expr, (exprStat, evaluator) ->
         //if min value is not false or if there are all nulls  -> canDrop
-        (exprStat, evaluator) -> ((BooleanStatistics)exprStat).getMin() || isAllNulls(exprStat, evaluator.getRowCount())
+        isAllNulls(exprStat, evaluator.getRowCount()) || exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin()
     );
   }
 
@@ -133,9 +133,9 @@ private static LogicalExpression createIsFalsePredicate(LogicalExpression expr)
    * IS NOT TRUE predicate.
    */
   private static LogicalExpression createIsNotTruePredicate(LogicalExpression expr) {
-    return new ParquetIsPredicate<Boolean>(expr,
+    return new ParquetIsPredicate<Boolean>(expr, (exprStat, evaluator) ->
         //if min value is not false or if there are no nulls  -> canDrop
-        (exprStat, evaluator) -> ((BooleanStatistics)exprStat).getMin() && hasNoNulls(exprStat)
+        hasNoNulls(exprStat) && exprStat.hasNonNullValue() && ((BooleanStatistics) exprStat).getMin()
     );
   }
 
@@ -143,9 +143,9 @@ private static LogicalExpression createIsNotTruePredicate(LogicalExpression expr
    * IS NOT FALSE predicate.
    */
   private static LogicalExpression createIsNotFalsePredicate(LogicalExpression expr) {
-    return new ParquetIsPredicate<Boolean>(expr,
+    return new ParquetIsPredicate<Boolean>(expr, (exprStat, evaluator) ->
         //if max value is not true or if there are no nulls  -> canDrop
-        (exprStat, evaluator) -> !((BooleanStatistics)exprStat).getMax() && hasNoNulls(exprStat)
+        hasNoNulls(exprStat) && exprStat.hasNonNullValue() && !((BooleanStatistics) exprStat).getMax()
     );
   }
 
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetPredicatesHelper.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetPredicatesHelper.java
index f804a7b06f..de4df1f5b9 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetPredicatesHelper.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/stat/ParquetPredicatesHelper.java
@@ -17,6 +17,7 @@
  */
 package org.apache.drill.exec.expr.stat;
 
+import org.apache.parquet.Preconditions;
 import org.apache.parquet.column.statistics.Statistics;
 
 /**
@@ -28,7 +29,7 @@ private ParquetPredicatesHelper() {
 
   /**
    * @param stat statistics object
-   * @return true if the input stat object has valid statistics; false otherwise
+   * @return <tt>true</tt> if the input stat object has valid statistics; false otherwise
    */
   static boolean isNullOrEmpty(Statistics stat) {
     return stat == null || stat.isEmpty();
@@ -39,22 +40,21 @@ static boolean isNullOrEmpty(Statistics stat) {
    *
    * @param stat parquet column statistics
    * @param rowCount number of rows in the parquet file
-   * @return True if all rows are null in the parquet file
-   *          False if at least one row is not null.
+   * @return <tt>true</tt> if all rows are null in the parquet file and <tt>false</tt> otherwise
    */
   static boolean isAllNulls(Statistics stat, long rowCount) {
-    return stat.isNumNullsSet() && stat.getNumNulls() == rowCount;
+    Preconditions.checkArgument(rowCount >= 0, String.format("negative rowCount %d is not valid", rowCount));
+    return stat.getNumNulls() == rowCount;
   }
 
   /**
-   * Checks that column chunk's statistics has at least one null
+   * Checks that column chunk's statistics does not have nulls
    *
    * @param stat parquet column statistics
-   * @return True if the parquet file has nulls
-   *          False if the parquet file hasn't nulls.
+   * @return <tt>true</tt> if the parquet file does not have nulls and <tt>false</tt> otherwise
    */
   static boolean hasNoNulls(Statistics stat) {
-    return !stat.isNumNullsSet() || stat.getNumNulls() == 0;
+    return stat.getNumNulls() <= 0;
   }
 
 }
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetReaderUtility.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetReaderUtility.java
index 40203f5989..a7f78fb40c 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetReaderUtility.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetReaderUtility.java
@@ -32,7 +32,7 @@
 import org.apache.parquet.SemanticVersion;
 import org.apache.parquet.VersionParser;
 import org.apache.parquet.column.ColumnDescriptor;
-import org.apache.parquet.column.statistics.Statistics;
+import org.apache.parquet.column.statistics.IntStatistics;
 import org.apache.parquet.format.ConvertedType;
 import org.apache.parquet.format.FileMetaData;
 import org.apache.parquet.format.SchemaElement;
@@ -417,16 +417,9 @@ public static DateCorruptionStatus checkForCorruptDateValuesInStatistics(Parquet
             // column does not appear in this file, skip it
             continue;
           }
-          Statistics statistics = footer.getBlocks().get(rowGroupIndex).getColumns().get(colIndex).getStatistics();
-          Integer max = (Integer) statistics.genericGetMax();
-          if (statistics.hasNonNullValue()) {
-            if (max > ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD) {
-              return DateCorruptionStatus.META_SHOWS_CORRUPTION;
-            }
-          } else {
-            // no statistics, go check the first page
-            return DateCorruptionStatus.META_UNCLEAR_TEST_VALUES;
-          }
+          IntStatistics statistics = (IntStatistics) footer.getBlocks().get(rowGroupIndex).getColumns().get(colIndex).getStatistics();
+          return (statistics.hasNonNullValue() && statistics.compareMaxToValue(ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD) > 0) ?
+              DateCorruptionStatus.META_SHOWS_CORRUPTION : DateCorruptionStatus.META_UNCLEAR_TEST_VALUES;
         }
       }
     }
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/metadata/Metadata.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/metadata/Metadata.java
index a61cc18328..225916999d 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/metadata/Metadata.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/metadata/Metadata.java
@@ -473,7 +473,7 @@ private ParquetFileMetadata_v3 getParquetFileMetadata_v3(ParquetTableMetadata_v3
           // Write stats when they are not null
           Object minValue = null;
           Object maxValue = null;
-          if (stats.genericGetMax() != null && stats.genericGetMin() != null ) {
+          if (stats.hasNonNullValue()) {
             minValue = stats.genericGetMin();
             maxValue = stats.genericGetMax();
             if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> Minor code improvements in parquet statistics handling
> ------------------------------------------------------
>
>                 Key: DRILL-6554
>                 URL: https://issues.apache.org/jira/browse/DRILL-6554
>             Project: Apache Drill
>          Issue Type: Improvement
>            Reporter: Vlad Rozov
>            Assignee: Vlad Rozov
>            Priority: Minor
>              Labels: ready-to-commit
>             Fix For: 1.14.0
>
>
> Avoid setting statistics in Metadata when min and max are not defined and few other minor code improvements.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)