Posted to commits@impala.apache.org by st...@apache.org on 2022/02/12 07:02:22 UTC

[impala] branch master updated (bde9954 -> c36ff0c)

This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git.


    from bde9954  IMPALA-955: BYTES built-in function
     new b02dba7  IMPALA-10648 addendum: Handle null return value from catalog's renameTable() api
     new bb23b43  IMPALA-11107: Allow specifying footer size in IssueFooterRanges
     new c36ff0c  IMPALA-10982: fix unable to explain the set operation statement

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 be/src/exec/hdfs-orc-scanner.cc                             |  2 +-
 be/src/exec/hdfs-orc-scanner.h                              | 11 ++++++++++-
 be/src/exec/hdfs-scanner.cc                                 |  7 ++++---
 be/src/exec/hdfs-scanner.h                                  | 12 ++----------
 be/src/exec/parquet/hdfs-parquet-scanner.cc                 |  5 +++--
 be/src/exec/parquet/hdfs-parquet-scanner.h                  | 10 +++++++++-
 .../java/org/apache/impala/analysis/AnalysisContext.java    |  5 ++++-
 .../catalog/metastore/CatalogMetastoreServiceHandler.java   |  2 +-
 .../impala/catalog/metastore/MetastoreServiceHandler.java   |  2 +-
 .../main/java/org/apache/impala/planner/HdfsScanNode.java   | 13 ++++++++++---
 .../functional-query/queries/QueryTest/explain-level0.test  |  8 ++++++++
 .../functional-query/queries/QueryTest/explain-level1.test  |  8 ++++++++
 .../functional-query/queries/QueryTest/explain-level2.test  |  9 +++++++++
 13 files changed, 70 insertions(+), 24 deletions(-)

[impala] 03/03: IMPALA-10982: fix unable to explain the set operation statement

Posted by st...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit c36ff0cdd7d03b0a511c78d941a32d4e78718654
Author: xqhe <he...@126.com>
AuthorDate: Sat Jan 29 11:36:11 2022 +0800

    IMPALA-10982: fix unable to explain the set operation statement
    
    For SetOperationStmt we replace the query statement with the
    rewritten version, but the explain flag was not set on the rewritten
    version if the original was an explain statement.
    
    Tests:
      -- Used impala-shell to verify explaining a set operation statement.
      -- Added new test cases to the explain_level tests.
    
    Change-Id: I19264dfa794ffd5ed7355acfef0ac35f17c809d3
    Reviewed-on: http://gerrit.cloudera.org:8080/18179
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
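
To make the fix concrete, here is a minimal standalone sketch of the re-analysis step; ParsedStmt, replaceWithRewritten and the demo class are simplified stand-ins for illustration, not Impala's actual classes:

    // Simplified stand-ins for illustration only; these are not Impala classes.
    class ParsedStmt {
      private boolean isExplain_ = false;
      void setIsExplain() { isExplain_ = true; }
      boolean isExplain() { return isExplain_; }
    }

    class SetOperationRewriteDemo {
      // Models the re-analysis step: the original statement is replaced by its
      // rewritten form, so any flags set on the original must be carried over.
      static ParsedStmt replaceWithRewritten(ParsedStmt original, ParsedStmt rewritten) {
        boolean isExplain = original.isExplain();
        // Without this, the rewritten statement would be planned and executed
        // instead of just explained -- the bug fixed here.
        if (isExplain) rewritten.setIsExplain();
        return rewritten;
      }

      public static void main(String[] args) {
        ParsedStmt original = new ParsedStmt();
        original.setIsExplain();                 // user typed EXPLAIN ... INTERSECT ...
        ParsedStmt rewritten = new ParsedStmt(); // the rewrite builds a fresh statement
        System.out.println("explain flag preserved: "
            + replaceWithRewritten(original, rewritten).isExplain()); // prints true
      }
    }
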
---
 fe/src/main/java/org/apache/impala/analysis/AnalysisContext.java | 5 ++++-
 .../functional-query/queries/QueryTest/explain-level0.test       | 8 ++++++++
 .../functional-query/queries/QueryTest/explain-level1.test       | 8 ++++++++
 .../functional-query/queries/QueryTest/explain-level2.test       | 9 +++++++++
 4 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/fe/src/main/java/org/apache/impala/analysis/AnalysisContext.java b/fe/src/main/java/org/apache/impala/analysis/AnalysisContext.java
index 8d60f80..f0c7fe9 100644
--- a/fe/src/main/java/org/apache/impala/analysis/AnalysisContext.java
+++ b/fe/src/main/java/org/apache/impala/analysis/AnalysisContext.java
@@ -556,12 +556,15 @@ public class AnalysisContext {
     if (!shouldReAnalyze) return;
 
     // For SetOperationStmt we must replace the query statement with the rewritten version
-    // before re-analysis.
+    // before re-analysis and set the explain flag of the rewritten version if the
+    // original is explain statement.
     if (analysisResult_.requiresSetOperationRewrite()) {
       if (analysisResult_.isSetOperationStmt()) {
         if (((SetOperationStmt) analysisResult_.getStmt()).hasRewrittenStmt()) {
+          boolean isExplain = analysisResult_.isExplainStmt();
           analysisResult_.stmt_ =
             ((SetOperationStmt) analysisResult_.getStmt()).getRewrittenStmt();
+          if (isExplain) analysisResult_.stmt_.setIsExplain();
         }
       }
     }
diff --git a/testdata/workloads/functional-query/queries/QueryTest/explain-level0.test b/testdata/workloads/functional-query/queries/QueryTest/explain-level0.test
index 400124a..6ee4c0e 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/explain-level0.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/explain-level0.test
@@ -26,3 +26,11 @@ where t1.month = 1 and t2.year = 2009 and t3.bool_col = false
 'WARNING: The following tables are missing relevant table and/or column statistics.'
 'functional_avro.alltypes, functional_parquet.alltypessmall'
 ====
+---- QUERY
+# Tests the set operation statement rewrite
+explain select year, month from functional.alltypes
+  intersect
+select year, month from functional.alltypes where year=2009;
+---- RESULTS: VERIFY_IS_SUBSET
+'04:HASH JOIN [LEFT SEMI JOIN, PARTITIONED]'
+====
\ No newline at end of file
diff --git a/testdata/workloads/functional-query/queries/QueryTest/explain-level1.test b/testdata/workloads/functional-query/queries/QueryTest/explain-level1.test
index 9a6dea3..476dabc 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/explain-level1.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/explain-level1.test
@@ -39,3 +39,11 @@ where t1.month = 1 and t2.year = 2009 and t3.bool_col = false
 'WARNING: The following tables are missing relevant table and/or column statistics.'
 'functional_avro.alltypes, functional_parquet.alltypessmall'
 ====
+---- QUERY
+# Tests the set operation statement rewrite
+explain select year, month from functional.alltypes
+  intersect
+select year, month from functional.alltypes where year=2009;
+---- RESULTS: VERIFY_IS_SUBSET
+'04:HASH JOIN [LEFT SEMI JOIN, PARTITIONED]'
+====
\ No newline at end of file
diff --git a/testdata/workloads/functional-query/queries/QueryTest/explain-level2.test b/testdata/workloads/functional-query/queries/QueryTest/explain-level2.test
index 6670ca0..d65e4f6 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/explain-level2.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/explain-level2.test
@@ -85,3 +85,12 @@ where t1.month = 1 and t2.year = 2009 and t3.bool_col = false
 'WARNING: The following tables are missing relevant table and/or column statistics.'
 'functional_avro.alltypes, functional_parquet.alltypessmall'
 ====
+---- QUERY
+# Tests the set operation statement rewrite
+explain select year, month from functional.alltypes
+  intersect
+select year, month from functional.alltypes where year=2009;
+---- RESULTS: VERIFY_IS_SUBSET
+'Per-Host Resources: mem-estimate=13.97MB mem-reservation=5.88MB thread-reservation=1 runtime-filters-memory=2.00MB'
+'04:HASH JOIN [LEFT SEMI JOIN, PARTITIONED]'
+====
\ No newline at end of file

[impala] 02/03: IMPALA-11107: Allow specifying footer size in IssueFooterRanges

Posted by st...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit bb23b4376239674117f2ebd5893b2c1ec8f321da
Author: Riza Suminto <ri...@cloudera.com>
AuthorDate: Wed Feb 9 16:20:09 2022 -0800

    IMPALA-11107: Allow specifying footer size in IssueFooterRanges
    
    FOOTER_SIZE was a 100KB constant in HdfsScanner::IssueFooterRanges,
    an estimate based on the Parquet format. Other scanner subclasses such
    as HdfsOrcScanner expect the footer size to be much lower, around
    16KB (orc::DIRECTORY_SIZE_GUESS in the ORC library). This patch adds
    footer_size_estimate as a parameter so that different file formats can
    request different footer range sizes. A more precise initial range
    also helps reduce waste in the data cache.
    
    Testing:
    - Passed core tests.
    - Manually verified that 'IoReadSkippedBytes' is reduced after this patch.
    
    Change-Id: Ib0a5aab48324bf9e78fb0bb0bf6f649d87e89dfc
    Reviewed-on: http://gerrit.cloudera.org:8080/18220
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
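
As a rough illustration of the footer-range math, here is a standalone sketch (the class and method names are illustrative, not the backend API): the scanner issues one range covering the last footer_size_estimate bytes of each file, clamped to the file length:

    // Standalone sketch of the footer-range computation; names are illustrative,
    // not Impala's actual classes.
    class FooterRangeDemo {
      static final long PARQUET_FOOTER_SIZE = 100L * 1024L; // 100KB estimate
      static final long ORC_FOOTER_SIZE = 16L * 1024L;      // ~orc::DIRECTORY_SIZE_GUESS

      /** Returns {offset, length} of the footer range to issue for one file. */
      static long[] footerRange(long fileLength, long footerSizeEstimate) {
        long footerSize = Math.min(footerSizeEstimate, fileLength); // never read past EOF
        long footerStart = fileLength - footerSize;
        return new long[] { footerStart, footerSize };
      }

      public static void main(String[] args) {
        long fileLength = 1024L * 1024L; // a 1MB ORC file
        long[] oldRange = footerRange(fileLength, PARQUET_FOOTER_SIZE);
        long[] newRange = footerRange(fileLength, ORC_FOOTER_SIZE);
        // With the old one-size-fits-all constant, roughly 84KB more than the
        // ORC library's 16KB guess would be read and cached per file.
        System.out.println("old: offset=" + oldRange[0] + " length=" + oldRange[1]);
        System.out.println("new: offset=" + newRange[0] + " length=" + newRange[1]);
      }
    }

If the estimate turns out to be too small for a particular file, the scanner simply issues another read for the rest of the footer, which is why these sizes are only guesses.
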
---
 be/src/exec/hdfs-orc-scanner.cc                             |  2 +-
 be/src/exec/hdfs-orc-scanner.h                              | 11 ++++++++++-
 be/src/exec/hdfs-scanner.cc                                 |  7 ++++---
 be/src/exec/hdfs-scanner.h                                  | 12 ++----------
 be/src/exec/parquet/hdfs-parquet-scanner.cc                 |  5 +++--
 be/src/exec/parquet/hdfs-parquet-scanner.h                  | 10 +++++++++-
 .../main/java/org/apache/impala/planner/HdfsScanNode.java   | 13 ++++++++++---
 7 files changed, 39 insertions(+), 21 deletions(-)

diff --git a/be/src/exec/hdfs-orc-scanner.cc b/be/src/exec/hdfs-orc-scanner.cc
index 83f120c..3150a27 100644
--- a/be/src/exec/hdfs-orc-scanner.cc
+++ b/be/src/exec/hdfs-orc-scanner.cc
@@ -51,7 +51,7 @@ Status HdfsOrcScanner::IssueInitialRanges(HdfsScanNodeBase* scan_node,
           file->filename, file->file_length));
     }
   }
-  return IssueFooterRanges(scan_node, THdfsFileFormat::ORC, files);
+  return IssueFooterRanges(scan_node, THdfsFileFormat::ORC, files, ORC_FOOTER_SIZE);
 }
 
 HdfsOrcScanner::OrcMemPool::OrcMemPool(HdfsOrcScanner* scanner)
diff --git a/be/src/exec/hdfs-orc-scanner.h b/be/src/exec/hdfs-orc-scanner.h
index 67159ee..337c6a9 100644
--- a/be/src/exec/hdfs-orc-scanner.h
+++ b/be/src/exec/hdfs-orc-scanner.h
@@ -170,6 +170,15 @@ class HdfsOrcScanner : public HdfsColumnarScanner {
   static Status IssueInitialRanges(HdfsScanNodeBase* scan_node,
       const std::vector<HdfsFileDesc*>& files) WARN_UNUSED_RESULT;
 
+  /// Size of the file footer for ORC.
+  /// This is a guess, matched with orc::DIRECTORY_SIZE_GUESS from ORC lib.
+  /// If this value is too little, we will need to issue another read.
+  static const int64_t ORC_FOOTER_SIZE = 1024 * 16;
+  static_assert(ORC_FOOTER_SIZE <= READ_SIZE_MIN_VALUE,
+      "ORC_FOOTER_SIZE can not be greater than READ_SIZE_MIN_VALUE.\n"
+      "You can increase ORC_FOOTER_SIZE if you want, "
+      "just don't forget to increase READ_SIZE_MIN_VALUE as well.");
+
   virtual Status Open(ScannerContext* context) override WARN_UNUSED_RESULT;
   virtual Status ProcessSplit() override WARN_UNUSED_RESULT;
   virtual void Close(RowBatch* row_batch) override;
@@ -334,7 +343,7 @@ class HdfsOrcScanner : public HdfsColumnarScanner {
   Status TransferTuples(RowBatch* dst_batch) WARN_UNUSED_RESULT;
 
   /// Process the file footer and parse file_metadata_.  This should be called with the
-  /// last FOOTER_SIZE bytes in context_.
+  /// last ORC_FOOTER_SIZE bytes in context_.
   Status ProcessFileTail() WARN_UNUSED_RESULT;
 
   /// Resolve SchemaPath in TupleDescriptors and translate them to ORC type ids into
diff --git a/be/src/exec/hdfs-scanner.cc b/be/src/exec/hdfs-scanner.cc
index 19a80d2..a5114dd 100644
--- a/be/src/exec/hdfs-scanner.cc
+++ b/be/src/exec/hdfs-scanner.cc
@@ -46,7 +46,6 @@ DEFINE_double(min_filter_reject_ratio, 0.1, "(Advanced) If the percentage of "
 
 const char* FieldLocation::LLVM_CLASS_NAME = "struct.impala::FieldLocation";
 const char* HdfsScanner::LLVM_CLASS_NAME = "class.impala::HdfsScanner";
-const int64_t HdfsScanner::FOOTER_SIZE;
 
 HdfsScanner::HdfsScanner(HdfsScanNodeBase* scan_node, RuntimeState* state)
     : scan_node_(scan_node),
@@ -824,12 +823,14 @@ void HdfsScanner::CheckFiltersEffectiveness() {
 }
 
 Status HdfsScanner::IssueFooterRanges(HdfsScanNodeBase* scan_node,
-    const THdfsFileFormat::type& file_type, const vector<HdfsFileDesc*>& files) {
+    const THdfsFileFormat::type& file_type, const vector<HdfsFileDesc*>& files,
+    int64_t footer_size_estimate) {
   DCHECK(!files.empty());
+  DCHECK_LE(footer_size_estimate, READ_SIZE_MIN_VALUE);
   vector<ScanRange*> footer_ranges;
   for (int i = 0; i < files.size(); ++i) {
     // Compute the offset of the file footer.
-    int64_t footer_size = min(FOOTER_SIZE, files[i]->file_length);
+    int64_t footer_size = min(footer_size_estimate, files[i]->file_length);
     int64_t footer_start = files[i]->file_length - footer_size;
     DCHECK_GE(footer_start, 0);
 
diff --git a/be/src/exec/hdfs-scanner.h b/be/src/exec/hdfs-scanner.h
index 6f80904..6b49f7d 100644
--- a/be/src/exec/hdfs-scanner.h
+++ b/be/src/exec/hdfs-scanner.h
@@ -381,14 +381,6 @@ class HdfsScanner {
   /// GetNext() API (i.e. mt_dop > 0), not the ProcessSplit() API.
   int64_t getnext_batches_returned_ = 0;
 
-  /// Size of the file footer for ORC and Parquet. This is a guess. If this value is too
-  /// little, we will need to issue another read.
-  static const int64_t FOOTER_SIZE = 1024 * 100;
-  static_assert(FOOTER_SIZE <= READ_SIZE_MIN_VALUE,
-      "FOOTER_SIZE can not be greater than READ_SIZE_MIN_VALUE.\n"
-      "You can increase FOOTER_SIZE if you want, "
-      "just don't forget to increase READ_SIZE_MIN_VALUE as well.");
-
   /// Check runtime filters' effectiveness every BATCHES_PER_FILTER_SELECTIVITY_CHECK
   /// row batches. Will update 'filter_stats_'.
   void CheckFiltersEffectiveness();
@@ -410,8 +402,8 @@ class HdfsScanner {
   /// Issue just the footer range for each file. This function is only used in parquet
   /// and orc scanners. We'll then parse the footer and pick out the columns we want.
   static Status IssueFooterRanges(HdfsScanNodeBase* scan_node,
-      const THdfsFileFormat::type& file_type, const std::vector<HdfsFileDesc*>& files)
-      WARN_UNUSED_RESULT;
+      const THdfsFileFormat::type& file_type, const std::vector<HdfsFileDesc*>& files,
+      int64_t footer_size_estimate) WARN_UNUSED_RESULT;
 
   /// Implements GetNext(). Should be overridden by subclasses.
   /// Only valid to call if the parent scan node is multi-threaded.
diff --git a/be/src/exec/parquet/hdfs-parquet-scanner.cc b/be/src/exec/parquet/hdfs-parquet-scanner.cc
index 2a9ed1f..87ad679 100644
--- a/be/src/exec/parquet/hdfs-parquet-scanner.cc
+++ b/be/src/exec/parquet/hdfs-parquet-scanner.cc
@@ -79,7 +79,8 @@ Status HdfsParquetScanner::IssueInitialRanges(HdfsScanNodeBase* scan_node,
           file->filename, file->file_length));
     }
   }
-  return IssueFooterRanges(scan_node, THdfsFileFormat::PARQUET, files);
+  return IssueFooterRanges(
+      scan_node, THdfsFileFormat::PARQUET, files, PARQUET_FOOTER_SIZE);
 }
 
 HdfsParquetScanner::HdfsParquetScanner(HdfsScanNodeBase* scan_node, RuntimeState* state)
@@ -2603,7 +2604,7 @@ Status HdfsParquetScanner::ProcessFooter() {
   // expect. Note we can't detect if the file is larger than we expect without attempting
   // to read past the end of the scan range, but in this case we'll fail below trying to
   // parse the footer.
-  DCHECK_LE(scan_range_len, FOOTER_SIZE);
+  DCHECK_LE(scan_range_len, PARQUET_FOOTER_SIZE);
   uint8_t* buffer;
   bool success = stream_->ReadBytes(scan_range_len, &buffer, &parse_status_);
   if (!success) {
diff --git a/be/src/exec/parquet/hdfs-parquet-scanner.h b/be/src/exec/parquet/hdfs-parquet-scanner.h
index da37732..122a976 100644
--- a/be/src/exec/parquet/hdfs-parquet-scanner.h
+++ b/be/src/exec/parquet/hdfs-parquet-scanner.h
@@ -381,6 +381,14 @@ class HdfsParquetScanner : public HdfsColumnarScanner {
       const vector<string>& max_vals, int start_page_idx, int end_page_idx,
       vector<PageRange>* skipped_ranges);
 
+  /// Size of the file footer for Parquet. This is a guess. If this value is too
+  /// little, we will need to issue another read.
+  static const int64_t PARQUET_FOOTER_SIZE = 1024 * 100;
+  static_assert(PARQUET_FOOTER_SIZE <= READ_SIZE_MIN_VALUE,
+      "PARQUET_FOOTER_SIZE can not be greater than READ_SIZE_MIN_VALUE.\n"
+      "You can increase PARQUET_FOOTER_SIZE if you want, "
+      "just don't forget to increase READ_SIZE_MIN_VALUE as well.");
+
  private:
   friend class ParquetColumnReader;
   friend class CollectionColumnReader;
@@ -778,7 +786,7 @@ class HdfsParquetScanner : public HdfsColumnarScanner {
       bool materialize_tuple, MemPool* pool, Tuple* tuple) const;
 
   /// Process the file footer and parse file_metadata_.  This should be called with the
-  /// last FOOTER_SIZE bytes in context_.
+  /// last PARQUET_FOOTER_SIZE bytes in context_.
   Status ProcessFooter() WARN_UNUSED_RESULT;
 
   /// Populates 'column_readers' for the slots in 'tuple_desc', including creating child
diff --git a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
index 60f339f..7e80f37 100644
--- a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
@@ -155,8 +155,13 @@ public class HdfsScanNode extends ScanNode {
   // TODO: is it worth making this a tunable query option?
   private static final long DEFAULT_COLUMN_SCAN_RANGE_RESERVATION = 4L * 1024L * 1024L;
 
-  // Read size for Parquet and ORC footers. Matches HdfsScanner::FOOTER_SIZE in backend.
-  private static final long FOOTER_SIZE = 100L * 1024L;
+  // Read size for Parquet and ORC footers.
+  // Matches HdfsParquetScanner::PARQUET_FOOTER_SIZE in backend.
+  private static final long PARQUET_FOOTER_SIZE = 100L * 1024L;
+
+  // Read size for ORC footers.
+  // Matches HdfsOrcScanner::ORC_FOOTER_SIZE in backend.
+  private static final long ORC_FOOTER_SIZE = 16L * 1024L;
 
   // When the information of cardinality is not available for the underlying hdfs table,
   // i.e., the field of cardinality_ is equal to -1, we will attempt to compute an
@@ -2102,7 +2107,9 @@ public class HdfsScanNode extends ScanNode {
         for (long columnReservation : columnReservations) {
           formatReservationBytes += columnReservation;
         }
-        formatReservationBytes = Math.max(FOOTER_SIZE, formatReservationBytes);
+        long footerSize =
+            format == HdfsFileFormat.ORC ? ORC_FOOTER_SIZE : PARQUET_FOOTER_SIZE;
+        formatReservationBytes = Math.max(footerSize, formatReservationBytes);
       } else {
         // Scanners for row-oriented formats issue only one IoMgr scan range at a time.
         // Minimum reservation is based on using I/O buffer per IoMgr scan range to get

[impala] 01/03: IMPALA-10648 addendum: Handle null return value from catalog's renameTable() api

Posted by st...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit b02dba74d8f072c206c3da2eca1b492c7b44a839
Author: Sourabh Goyal <so...@cloudera.com>
AuthorDate: Thu Feb 10 10:44:34 2022 -0800

    IMPALA-10648 addendum: Handle null return value from catalog's
    renameTable() api
    
    The CatalogServiceCatalog.renameTable(oldTable, newTable) api
    sometimes returns null if oldTable's db does not exist
    in the cache. Callers of this api should check for a null
    return value.
    
    Change-Id: Ia3661b351cad16ead22608dcef33789f0ba2fb6f
    Reviewed-on: http://gerrit.cloudera.org:8080/18221
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
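
A minimal sketch of the null-safe pattern the addendum applies; TablePair and the demo catalog method are simplified stand-ins, not Impala's catalog or Pair classes:

    // Illustrative stand-ins only; not Impala's catalog classes.
    class TablePair {
      final Object first;   // the table removed under the old name, may be null
      final Object second;  // the table added under the new name, may be null
      TablePair(Object first, Object second) { this.first = first; this.second = second; }
    }

    class RenameCheckDemo {
      // Models a catalog whose renameTable() may return null when the old
      // table's database is not present in the cache.
      static TablePair renameTable(boolean oldDbInCache) {
        if (!oldDbInCache) return null;
        return new TablePair(new Object(), new Object());
      }

      public static void main(String[] args) {
        TablePair result = renameTable(false);
        // Checking 'result' itself first avoids the NullPointerException that a
        // bare 'result.first == null' check would hit when the api returns null.
        if (result == null || result.first == null || result.second == null) {
          System.out.println("rename failed; caller must handle it");
        } else {
          System.out.println("rename succeeded");
        }
      }
    }
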
---
 .../apache/impala/catalog/metastore/CatalogMetastoreServiceHandler.java | 2 +-
 .../org/apache/impala/catalog/metastore/MetastoreServiceHandler.java    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/catalog/metastore/CatalogMetastoreServiceHandler.java b/fe/src/main/java/org/apache/impala/catalog/metastore/CatalogMetastoreServiceHandler.java
index cdd4226..c5c66d8 100644
--- a/fe/src/main/java/org/apache/impala/catalog/metastore/CatalogMetastoreServiceHandler.java
+++ b/fe/src/main/java/org/apache/impala/catalog/metastore/CatalogMetastoreServiceHandler.java
@@ -1322,7 +1322,7 @@ public class CatalogMetastoreServiceHandler extends MetastoreServiceHandler {
         // ALTER TABLE/VIEW RENAME is implemented as an ADD + DROP.
         Pair<org.apache.impala.catalog.Table, org.apache.impala.catalog.Table> result =
             catalog_.renameTable(oldTTable, newTTable);
-        if (result.first == null && result.second == null) {
+        if (result == null || result.first == null || result.second == null) {
           throw new CatalogException("failed to rename table " + oldTTable + " to " +
               newTTable + " for " + apiName);
         }
diff --git a/fe/src/main/java/org/apache/impala/catalog/metastore/MetastoreServiceHandler.java b/fe/src/main/java/org/apache/impala/catalog/metastore/MetastoreServiceHandler.java
index 0b624c7..0db30cc 100644
--- a/fe/src/main/java/org/apache/impala/catalog/metastore/MetastoreServiceHandler.java
+++ b/fe/src/main/java/org/apache/impala/catalog/metastore/MetastoreServiceHandler.java
@@ -3127,7 +3127,7 @@ public abstract class MetastoreServiceHandler extends AbstractThriftHiveMetastor
     LOG.debug("Renaming " + tableInfo);
     Pair<org.apache.impala.catalog.Table, org.apache.impala.catalog.Table> result =
         catalog_.renameTable(oldTable, newTable);
-    if (result.first == null || result.second == null) {
+    if (result == null || result.first == null || result.second == null) {
       LOG.debug("Couldn't rename " + tableInfo);
     } else {
       LOG.info("Successfully renamed " + tableInfo);