Posted to dev@drill.apache.org by GitBox <gi...@apache.org> on 2018/11/26 16:04:23 UTC

[GitHub] asfgit closed pull request #1548: DRILL-6857: Read only required row groups in a file when limit push down is applied

asfgit closed pull request #1548: DRILL-6857: Read only required row groups in a file when limit push down is applied
URL: https://github.com/apache/drill/pull/1548

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetGroupScan.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetGroupScan.java
index 9bc969f035b..0d35ddbdb42 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetGroupScan.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetGroupScan.java
@@ -246,7 +246,6 @@ public GroupScan applyFilter(LogicalExpression filterExpr, UdfUtilities udfUtili
     final Set<SchemaPath> schemaPathsInExpr = filterExpr.accept(new ParquetRGFilterEvaluator.FieldReferenceFinder(), null);
 
     final List<RowGroupInfo> qualifiedRGs = new ArrayList<>(rowGroupInfos.size());
-    Set<String> qualifiedFilePath = new HashSet<>(); // HashSet keeps a fileName unique.
 
     ParquetFilterPredicate filterPredicate = null;
 
@@ -289,17 +288,15 @@ public GroupScan applyFilter(LogicalExpression filterExpr, UdfUtilities udfUtili
       rowGroup.setRowsMatch(match);
 
       qualifiedRGs.add(rowGroup);
-      qualifiedFilePath.add(rowGroup.getPath());
     }
 
     if (qualifiedRGs.size() == rowGroupInfos.size() ) {
       // There is no reduction of rowGroups. Return the original groupScan.
-      logger.debug("applyFilter does not have any pruning!");
+      logger.debug("applyFilter() does not have any pruning!");
       return null;
-    } else if (qualifiedFilePath.size() == 0) {
-      logger.debug("All rowgroups have been filtered out. Add back one to get schema from scannner");
+    } else if (qualifiedRGs.size() == 0) {
+      logger.debug("All row groups have been filtered out. Add back one to get schema from scanner.");
       RowGroupInfo rg = rowGroupInfos.iterator().next();
-      qualifiedFilePath.add(rg.getPath());
       qualifiedRGs.add(rg);
     }
 
@@ -307,11 +304,7 @@ public GroupScan applyFilter(LogicalExpression filterExpr, UdfUtilities udfUtili
       ExpressionStringBuilder.toString(filterExpr), rowGroupInfos.size(), qualifiedRGs.size());
 
     try {
-      AbstractParquetGroupScan cloneGroupScan = cloneWithFileSelection(qualifiedFilePath);
-      cloneGroupScan.rowGroupInfos = qualifiedRGs;
-      cloneGroupScan.parquetGroupScanStatistics.collect(cloneGroupScan.rowGroupInfos, cloneGroupScan.parquetTableMetadata);
-      return cloneGroupScan;
-
+      return cloneWithRowGroupInfos(qualifiedRGs);
     } catch (IOException e) {
       logger.warn("Could not apply filter prune due to Exception : {}", e);
       return null;
@@ -330,29 +323,41 @@ public GroupScan applyLimit(int maxRecords) {
     maxRecords = Math.max(maxRecords, 1); // Make sure it request at least 1 row -> 1 rowGroup.
     // further optimization : minimize # of files chosen, or the affinity of files chosen.
 
+    if (parquetGroupScanStatistics.getRowCount() <= maxRecords) {
+      logger.debug("limit push down does not apply, since total number of rows [{}] is less or equal to the required [{}].",
+        parquetGroupScanStatistics.getRowCount(), maxRecords);
+      return null;
+    }
+
     // Calculate number of rowGroups to read based on maxRecords and update
     // number of records to read for each of those rowGroups.
-    int index = updateRowGroupInfo(maxRecords);
-
-    Set<String> filePaths = rowGroupInfos.subList(0, index).stream()
-        .map(ReadEntryWithPath::getPath)
-        .collect(Collectors.toSet()); // HashSet keeps a filePath unique.
+    List<RowGroupInfo> qualifiedRowGroupInfos = new ArrayList<>(rowGroupInfos.size());
+    int currentRowCount = 0;
+    for (RowGroupInfo rowGroupInfo : rowGroupInfos) {
+      long rowCount = rowGroupInfo.getRowCount();
+      if (currentRowCount + rowCount <= maxRecords) {
+        currentRowCount += rowCount;
+        rowGroupInfo.setNumRecordsToRead(rowCount);
+        qualifiedRowGroupInfos.add(rowGroupInfo);
+        continue;
+      } else if (currentRowCount < maxRecords) {
+        rowGroupInfo.setNumRecordsToRead(maxRecords - currentRowCount);
+        qualifiedRowGroupInfos.add(rowGroupInfo);
+      }
+      break;
+    }
 
-    // If there is no change in fileSet, no need to create new groupScan.
-    if (filePaths.size() == fileSet.size() ) {
-      // There is no reduction of rowGroups. Return the original groupScan.
-      logger.debug("applyLimit() does not apply!");
+    if (rowGroupInfos.size() == qualifiedRowGroupInfos.size()) {
+      logger.debug("limit push down does not apply, since number of row groups was not reduced.");
       return null;
     }
 
-    logger.debug("applyLimit() reduce parquet file # from {} to {}", fileSet.size(), filePaths.size());
+    logger.debug("applyLimit() reduce parquet row groups # from {} to {}.", rowGroupInfos.size(), qualifiedRowGroupInfos.size());
 
     try {
-      AbstractParquetGroupScan newScan = cloneWithFileSelection(filePaths);
-      newScan.updateRowGroupInfo(maxRecords);
-      return newScan;
+      return cloneWithRowGroupInfos(qualifiedRowGroupInfos);
     } catch (IOException e) {
-      logger.warn("Could not apply rowcount based prune due to Exception : {}", e);
+      logger.warn("Could not apply row count based prune due to Exception: {}", e);
       return null;
     }
   }
@@ -454,30 +459,22 @@ protected String getFilterString() {
 
   // private methods block start
   /**
-   * Based on maxRecords to read for the scan,
-   * figure out how many rowGroups to read
-   * and update number of records to read for each of them.
+   * Clones current group scan with set of file paths from given row groups,
+   * updates new scan with list of given row groups,
+   * re-calculates statistics and endpoint affinities.
    *
-   * @param maxRecords max records to read
-   * @return total number of rowGroups to read
+   * @param rowGroupInfos list of row group infos
+   * @return new parquet group scan
    */
-  private int updateRowGroupInfo(int maxRecords) {
-    long count = 0;
-    int index = 0;
-    for (RowGroupInfo rowGroupInfo : rowGroupInfos) {
-      long rowCount = rowGroupInfo.getRowCount();
-      if (count + rowCount <= maxRecords) {
-        count += rowCount;
-        rowGroupInfo.setNumRecordsToRead(rowCount);
-        index++;
-        continue;
-      } else if (count < maxRecords) {
-        rowGroupInfo.setNumRecordsToRead(maxRecords - count);
-        index++;
-      }
-      break;
-    }
-    return index;
+  private AbstractParquetGroupScan cloneWithRowGroupInfos(List<RowGroupInfo> rowGroupInfos) throws IOException {
+    Set<String> filePaths = rowGroupInfos.stream()
+      .map(ReadEntryWithPath::getPath)
+      .collect(Collectors.toSet()); // set keeps file names unique
+    AbstractParquetGroupScan scan = cloneWithFileSelection(filePaths);
+    scan.rowGroupInfos = rowGroupInfos;
+    scan.parquetGroupScanStatistics.collect(scan.rowGroupInfos, scan.parquetTableMetadata);
+    scan.endpointAffinities = AffinityCreator.getAffinityMap(scan.rowGroupInfos);
+    return scan;
   }
   // private methods block end
 
diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/TestParquetLimitPushDown.java b/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/TestParquetLimitPushDown.java
new file mode 100644
index 00000000000..774979604ea
--- /dev/null
+++ b/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/TestParquetLimitPushDown.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.drill.exec.store.parquet;
+
+import org.apache.drill.test.ClusterFixture;
+import org.apache.drill.test.ClusterFixtureBuilder;
+import org.apache.drill.test.ClusterTest;
+import org.apache.drill.test.QueryBuilder;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.nio.file.Paths;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class TestParquetLimitPushDown extends ClusterTest {
+
+  @BeforeClass
+  public static void setup() throws Exception {
+    ClusterFixtureBuilder builder = ClusterFixture.builder(dirTestWatcher);
+    dirTestWatcher.copyResourceToRoot(Paths.get("parquet", "multirowgroup.parquet"));
+    dirTestWatcher.copyResourceToRoot(Paths.get("parquet", "users"));
+    startCluster(builder);
+  }
+
+  @Test
+  public void testMultipleFiles() throws Exception {
+    String query = "select * from dfs.`parquet/users` limit 1";
+    QueryBuilder.QuerySummary summary = queryBuilder().sql(query).run();
+    assertTrue(summary.succeeded());
+    assertEquals(1, summary.recordCount());
+
+    String plan = queryBuilder().sql(query).explainText();
+    assertTrue(plan.contains("numRowGroups=1"));
+  }
+
+  @Test
+  public void testMultipleRowGroups() throws Exception {
+    String query = "select * from dfs.`parquet/multirowgroup.parquet` limit 1";
+    QueryBuilder.QuerySummary summary = queryBuilder().sql(query).run();
+    assertTrue(summary.succeeded());
+    assertEquals(1, summary.recordCount());
+
+    String plan = queryBuilder().sql(query).explainText();
+    assertTrue(plan.contains("numRowGroups=1"));
+  }
+
+  @Test
+  public void testLimitZero() throws Exception {
+    String query = "select * from dfs.`parquet/users` limit 0";
+    QueryBuilder.QuerySummary summary = queryBuilder().sql(query).run();
+    assertTrue(summary.succeeded());
+    assertEquals(0, summary.recordCount());
+
+    String plan = queryBuilder().sql(query).explainText();
+    assertTrue(plan.contains("numRowGroups=1"));
+  }
+
+}
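
For anyone reading the diff outside an IDE, the core of the change is the row-group selection loop added to applyLimit(): walk the row groups in file order, take whole row groups while they still fit under the limit, read only the remainder from the first row group that would cross it, then stop. The sketch below restates that loop as a self-contained Java example; the class and method names (LimitPushDownSketch, RowGroup, selectRowGroups) are made up for the sketch, and RowGroup is a hypothetical stand-in for Drill's RowGroupInfo, kept minimal so the example compiles on its own.

import java.util.ArrayList;
import java.util.List;

public class LimitPushDownSketch {

  // Hypothetical stand-in for Drill's RowGroupInfo: just the total row count
  // and how many of those rows the scan should actually read.
  static class RowGroup {
    final long rowCount;
    long numRecordsToRead;

    RowGroup(long rowCount) {
      this.rowCount = rowCount;
    }
  }

  // Same shape as the loop added to applyLimit(): accumulate whole row groups
  // while the running total stays within maxRecords, take a partial read from
  // the first row group that would exceed it, then stop scanning.
  static List<RowGroup> selectRowGroups(List<RowGroup> rowGroups, int maxRecords) {
    List<RowGroup> qualified = new ArrayList<>(rowGroups.size());
    long currentRowCount = 0;
    for (RowGroup rowGroup : rowGroups) {
      long rowCount = rowGroup.rowCount;
      if (currentRowCount + rowCount <= maxRecords) {
        currentRowCount += rowCount;
        rowGroup.numRecordsToRead = rowCount;
        qualified.add(rowGroup);
        continue;
      } else if (currentRowCount < maxRecords) {
        rowGroup.numRecordsToRead = maxRecords - currentRowCount;
        qualified.add(rowGroup);
      }
      break;
    }
    return qualified;
  }

  public static void main(String[] args) {
    List<RowGroup> rowGroups = new ArrayList<>();
    rowGroups.add(new RowGroup(10));
    rowGroups.add(new RowGroup(10));
    rowGroups.add(new RowGroup(10));

    // LIMIT 15 over three 10-row row groups: the first is read fully,
    // 5 records come from the second, and the third is pruned entirely.
    List<RowGroup> qualified = selectRowGroups(rowGroups, 15);
    System.out.println("row groups read: " + qualified.size());                     // 2
    System.out.println("records from last: " + qualified.get(1).numRecordsToRead);  // 5
  }
}

The pruned list is then handed to the new cloneWithRowGroupInfos() helper shown in the diff, which rebuilds the group scan (file set, statistics and endpoint affinities) from exactly those row groups, so the scan reads only the required row groups of a file rather than every row group in the selected files.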


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services