You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by cs...@apache.org on 2019/08/14 09:59:37 UTC
[impala] 01/03: IMPALA-4551: Limit the size of SQL statements

This is an automated email from the ASF dual-hosted git repository.

csringhofer pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 1908e44c3c9faac8c7bf09422ca4c5ec598ffd58
Author: Joe McDonnell <jo...@cloudera.com>
AuthorDate: Wed Jul 24 17:12:06 2019 -0700

    IMPALA-4551: Limit the size of SQL statements
    
    Various BI tools generate and run SQL. When used incorrectly or
    misconfigured, the tools can generate extremely large SQLs.
    Some of these SQL statements reach 10s of megabytes. Large SQL
    statements impose costs throughout execution, including
    statement rewrite logic in the frontend and codegen in the
    backend. The resource usage of these statements can impact
    the stability of the system or the ability to run other SQL
    statements.
    
    This implements two new query options that provide controls
    to reject large SQL statements.
     - The first, MAX_STATEMENT_LENGTH_BYTES is a cap on the
       total size of the SQL statement (in bytes). It is
       applied before any parsing or analysis. It uses a
       default value of 16MB.
     - The second, STATEMENT_EXPRESSION_LIMIT, is a limit on
       the total number of expressions in a statement or any
       views that it references. The limit is applied upon the
       first round of analysis, but it is not reapplied when
       statement rewrite rules are applied. Certain expressions
       such as literals in IN lists or VALUES clauses are not
       analyzed and do not count towards the limit. It uses
       a default value of 250,000.
    The two are complementary. Since enforcing the statement
    expression limit requires parsing and analyzing the
    statement, the MAX_STATEMENT_LENGTH_BYTES sets an upper
    bound on the size of statement that needs to be parsed
    and analyzed. Testing confirms that even statements
    approaching 16MB get through the first round of analysis
    within a few seconds and then are rejected.
    
    This also changes the logging in tests/common/impala_connection.py
    to limit the total SQL size that it will print to 128KB. This is
    prevents the JUnitXML (which includes this logging) from being too
    large. Existing tests do not run SQL larger than about 80KB, so
    this only applies to tests added in this change that run multi-MB
    SQLs to verify limits.
    
    Testing:
     - This adds frontend tests that verify the low level
       semantics about how expressions are counted and verifies
       that the expression limits are enforced.
     - This adds end-to-end tests that verify both the
       MAX_STATEMENT_LENGTH_BYTES and STATEMENT_EXPRESSION_LIMIT
       at their defaults values.
     - There is also an end-to-end test that runs in exhaustive
       mode that runs a SQL with close to 250,000 expressions.
    
    Change-Id: I5675fb4a08c1dc51ae5bcf467cbb969cc064602c
    Reviewed-on: http://gerrit.cloudera.org:8080/14012
    Reviewed-by: Joe McDonnell <jo...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/service/impala-server.cc                    |   8 ++
 be/src/service/query-options-test.cc               |   7 +-
 be/src/service/query-options.cc                    |  26 ++++
 be/src/service/query-options.h                     |  17 ++-
 common/thrift/ImpalaInternalService.thrift         |  12 ++
 common/thrift/ImpalaService.thrift                 |  14 ++
 common/thrift/generate_error_codes.py              |   3 +
 .../apache/impala/analysis/AnalysisContext.java    |   8 ++
 .../java/org/apache/impala/analysis/Analyzer.java  |  17 +++
 .../main/java/org/apache/impala/analysis/Expr.java |  23 +++
 .../apache/impala/analysis/AnalyzeExprsTest.java   | 159 +++++++++++++++++++++
 tests/common/impala_connection.py                  |  31 +++-
 tests/query_test/test_exprs.py                     |  67 ++++++++-
 13 files changed, 381 insertions(+), 11 deletions(-)

diff --git a/be/src/service/impala-server.cc b/be/src/service/impala-server.cc
index d0966ba..9c4c732 100644
--- a/be/src/service/impala-server.cc
+++ b/be/src/service/impala-server.cc
@@ -1106,6 +1106,14 @@ Status ImpalaServer::ExecuteInternal(
     }
 #endif
 
+    size_t statement_length = query_ctx.client_request.stmt.length();
+    int32_t max_statement_length =
+        query_ctx.client_request.query_options.max_statement_length_bytes;
+    if (max_statement_length > 0 && statement_length > max_statement_length) {
+      return Status(ErrorMsg(TErrorCode::MAX_STATEMENT_LENGTH_EXCEEDED,
+          statement_length, max_statement_length));
+    }
+
     RETURN_IF_ERROR((*request_state)->UpdateQueryStatus(
         exec_env_->frontend()->GetExecRequest(query_ctx, &result)));
 
diff --git a/be/src/service/query-options-test.cc b/be/src/service/query-options-test.cc
index 79f50cc..715422d 100644
--- a/be/src/service/query-options-test.cc
+++ b/be/src/service/query-options-test.cc
@@ -161,7 +161,10 @@ TEST(QueryOptions, SetByteOptions) {
           {8 * 1024, RuntimeFilterBank::MAX_BLOOM_FILTER_SIZE}},
       {MAKE_OPTIONDEF(runtime_bloom_filter_size),
           {RuntimeFilterBank::MIN_BLOOM_FILTER_SIZE,
-              RuntimeFilterBank::MAX_BLOOM_FILTER_SIZE}}};
+              RuntimeFilterBank::MAX_BLOOM_FILTER_SIZE}},
+      {MAKE_OPTIONDEF(max_statement_length_bytes),
+          {MIN_MAX_STATEMENT_LENGTH_BYTES, I32_MAX}},
+  };
   TestByteCaseSet(options, case_set_i64);
   TestByteCaseSet(options, case_set_i32);
 }
@@ -239,6 +242,8 @@ TEST(QueryOptions, SetIntOptions) {
       {MAKE_OPTIONDEF(exec_time_limit_s),              {0, I32_MAX}},
       {MAKE_OPTIONDEF(thread_reservation_limit),       {-1, I32_MAX}},
       {MAKE_OPTIONDEF(thread_reservation_aggregate_limit), {-1, I32_MAX}},
+      {MAKE_OPTIONDEF(statement_expression_limit),
+          {MIN_STATEMENT_EXPRESSION_LIMIT, I32_MAX}},
   };
   for (const auto& test_case : case_set) {
     const OptionDef<int32_t>& option_def = test_case.first;
diff --git a/be/src/service/query-options.cc b/be/src/service/query-options.cc
index a84dd49..d48d0ec 100644
--- a/be/src/service/query-options.cc
+++ b/be/src/service/query-options.cc
@@ -815,6 +815,32 @@ Status impala::SetQueryOption(const string& key, const string& value,
         query_options->__set_default_transactional_type(enum_type);
         break;
       }
+      case TImpalaQueryOptions::STATEMENT_EXPRESSION_LIMIT: {
+        StringParser::ParseResult result;
+        const int32_t statement_expression_limit =
+          StringParser::StringToInt<int32_t>(value.c_str(), value.length(), &result);
+        if (result != StringParser::PARSE_SUCCESS ||
+            statement_expression_limit < MIN_STATEMENT_EXPRESSION_LIMIT) {
+          return Status(Substitute("Invalid statement expression limit: $0 "
+              "Valid values are in [$1, $2]", value, MIN_STATEMENT_EXPRESSION_LIMIT,
+              std::numeric_limits<int32_t>::max()));
+        }
+        query_options->__set_statement_expression_limit(statement_expression_limit);
+        break;
+      }
+      case TImpalaQueryOptions::MAX_STATEMENT_LENGTH_BYTES: {
+        int64_t max_statement_length_bytes;
+        RETURN_IF_ERROR(ParseMemValue(value, "max statement length bytes",
+            &max_statement_length_bytes));
+        if (max_statement_length_bytes < MIN_MAX_STATEMENT_LENGTH_BYTES ||
+            max_statement_length_bytes > std::numeric_limits<int32_t>::max()) {
+          return Status(Substitute("Invalid maximum statement length: $0 "
+              "Valid values are in [$1, $2]", max_statement_length_bytes,
+              MIN_MAX_STATEMENT_LENGTH_BYTES, std::numeric_limits<int32_t>::max()));
+        }
+        query_options->__set_max_statement_length_bytes(max_statement_length_bytes);
+        break;
+      }
       default:
         if (IsRemovedQueryOption(key)) {
           LOG(WARNING) << "Ignoring attempt to set removed query option '" << key << "'";
diff --git a/be/src/service/query-options.h b/be/src/service/query-options.h
index 6bb7153..ff408dd 100644
--- a/be/src/service/query-options.h
+++ b/be/src/service/query-options.h
@@ -47,7 +47,7 @@ typedef std::unordered_map<string, beeswax::TQueryOptionLevel::type>
 // time we add or remove a query option to/from the enum TImpalaQueryOptions.
 #define QUERY_OPTS_TABLE\
   DCHECK_EQ(_TImpalaQueryOptions_VALUES_TO_NAMES.size(),\
-      TImpalaQueryOptions::DEFAULT_TRANSACTIONAL_TYPE + 1);\
+      TImpalaQueryOptions::MAX_STATEMENT_LENGTH_BYTES + 1);\
   REMOVED_QUERY_OPT_FN(abort_on_default_limit_exceeded, ABORT_ON_DEFAULT_LIMIT_EXCEEDED)\
   QUERY_OPT_FN(abort_on_error, ABORT_ON_ERROR, TQueryOptionLevel::REGULAR)\
   REMOVED_QUERY_OPT_FN(allow_unsupported_formats, ALLOW_UNSUPPORTED_FORMATS)\
@@ -174,12 +174,21 @@ typedef std::unordered_map<string, beeswax::TQueryOptionLevel::type>
   QUERY_OPT_FN(spool_query_results, SPOOL_QUERY_RESULTS,\
       TQueryOptionLevel::DEVELOPMENT)\
   QUERY_OPT_FN(default_transactional_type, DEFAULT_TRANSACTIONAL_TYPE,\
-      TQueryOptionLevel::ADVANCED)
+      TQueryOptionLevel::ADVANCED)\
+  QUERY_OPT_FN(statement_expression_limit, STATEMENT_EXPRESSION_LIMIT,\
+      TQueryOptionLevel::REGULAR)\
+  QUERY_OPT_FN(max_statement_length_bytes, MAX_STATEMENT_LENGTH_BYTES,\
+      TQueryOptionLevel::REGULAR)
   ;
 
 /// Enforce practical limits on some query options to avoid undesired query state.
-  static const int64_t SPILLABLE_BUFFER_LIMIT = 1LL << 40; // 1 TB
-  static const int64_t ROW_SIZE_LIMIT = 1LL << 40; // 1 TB
+static const int64_t SPILLABLE_BUFFER_LIMIT = 1LL << 40; // 1 TB
+static const int64_t ROW_SIZE_LIMIT = 1LL << 40; // 1 TB
+
+/// Limits on the query size are intended to be large. Prevent them from being set
+/// to small values (which can prevent clients from executing anything).
+static const int32_t MIN_STATEMENT_EXPRESSION_LIMIT = 1 << 10; // 1024
+static const int32_t MIN_MAX_STATEMENT_LENGTH_BYTES = 1 << 10; // 1 KB
 
 /// Converts a TQueryOptions struct into a map of key, value pairs.  Options that
 /// aren't set and lack defaults in common/thrift/ImpalaInternalService.thrift are
diff --git a/common/thrift/ImpalaInternalService.thrift b/common/thrift/ImpalaInternalService.thrift
index 3e12895..be79822 100644
--- a/common/thrift/ImpalaInternalService.thrift
+++ b/common/thrift/ImpalaInternalService.thrift
@@ -367,6 +367,18 @@ struct TQueryOptions {
 
   // See comment in ImpalaService.thrift
   87: optional TTransactionalType default_transactional_type = TTransactionalType.NONE;
+
+  // See comment in ImpalaService.thrift.
+  // The default of 250,000 is set to a high value to avoid impacting existing users, but
+  // testing indicates a statement with this number of expressions can run.
+  88: optional i32 statement_expression_limit = 250000
+
+  // See comment in ImpalaService.thrift
+  // The default is set to 16MB. It is likely that a statement of this size would exceed
+  // the statement expression limit. Setting a limit on the total statement size avoids
+  // the cost of parsing and analyzing the statement, which is required to enforce the
+  // statement expression limit.
+  89: optional i32 max_statement_length_bytes = 16777216
 }
 
 // Impala currently has two types of sessions: Beeswax and HiveServer2
diff --git a/common/thrift/ImpalaService.thrift b/common/thrift/ImpalaService.thrift
index ae03051..d5c57aa 100644
--- a/common/thrift/ImpalaService.thrift
+++ b/common/thrift/ImpalaService.thrift
@@ -415,6 +415,20 @@ enum TImpalaQueryOptions {
   // Speficies the default transactional type for new HDFS tables.
   // Valid values: none, insert_only
   DEFAULT_TRANSACTIONAL_TYPE = 86
+
+  // Limit on the total number of expressions in the statement. Statements that exceed
+  // the limit will get an error during analysis. This is intended to set an upper
+  // bound on the complexity of statements to avoid resource impacts such as excessive
+  // time in analysis or codegen. This is enforced only for the first pass of analysis
+  // before any rewrites are applied.
+  STATEMENT_EXPRESSION_LIMIT = 87
+
+  // Limit on the total length of a SQL statement. Statements that exceed the maximum
+  // length will get an error before parsing/analysis. This is complementary to the
+  // statement expression limit, because statements of a certain size are highly
+  // likely to violate the statement expression limit. Rejecting them early avoids
+  // the cost of parsing/analysis.
+  MAX_STATEMENT_LENGTH_BYTES = 88
 }
 
 // The summary of a DML statement.
diff --git a/common/thrift/generate_error_codes.py b/common/thrift/generate_error_codes.py
index bb45c3d..d9d4d68 100755
--- a/common/thrift/generate_error_codes.py
+++ b/common/thrift/generate_error_codes.py
@@ -430,6 +430,9 @@ error_codes = (
   ("LZ4_DECOMPRESS_SAFE_FAILED", 141, "LZ4: LZ4_decompress_safe failed"),
 
   ("LZ4_COMPRESS_DEFAULT_FAILED", 142, "LZ4: LZ4_compress_default failed"),
+
+  ("MAX_STATEMENT_LENGTH_EXCEEDED", 143, "Statement length of $0 bytes exceeds the "
+   "maximum statement length ($1 bytes)"),
 )
 
 import sys
diff --git a/fe/src/main/java/org/apache/impala/analysis/AnalysisContext.java b/fe/src/main/java/org/apache/impala/analysis/AnalysisContext.java
index d5312f9..29cb6f8 100644
--- a/fe/src/main/java/org/apache/impala/analysis/AnalysisContext.java
+++ b/fe/src/main/java/org/apache/impala/analysis/AnalysisContext.java
@@ -454,6 +454,14 @@ public class AnalysisContext {
     Preconditions.checkNotNull(analysisResult_.stmt_);
     analysisResult_.analyzer_ = createAnalyzer(stmtTableCache);
     analysisResult_.stmt_.analyze(analysisResult_.analyzer_);
+    // Enforce the statement expression limit at the end of analysis so that there is an
+    // accurate count of the total number of expressions. The first analyze() call is not
+    // very expensive (~seconds) even for large statements. The limit on the total length
+    // of the SQL statement (max_statement_length_bytes) provides an upper bound.
+    // It is important to enforce this before expression rewrites, because rewrites are
+    // expensive with large expression trees. For example, a SQL that takes a few seconds
+    // to analyze the first time may take 10 minutes for rewrites.
+    analysisResult_.analyzer_.checkStmtExprLimit();
     boolean isExplain = analysisResult_.isExplainStmt();
 
     // Apply expr and subquery rewrites.
diff --git a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
index f0656ec..75c1fbe 100644
--- a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
+++ b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
@@ -455,6 +455,12 @@ public class Analyzer {
     // Expr rewriter for normalizing and rewriting expressions.
     private final ExprRewriter exprRewriter_;
 
+    // Total number of expressions across the statement (including all subqueries). This
+    // is used to enforce a limit on the total number of expressions. Incremented by
+    // incrementNumStmtExprs(). Note that this does not include expressions that do not
+    // require analysis (e.g. some literal expressions).
+    private int numStmtExprs_ = 0;
+
     public GlobalState(StmtTableCache stmtTableCache, TQueryCtx queryCtx,
         AuthorizationFactory authzFactory) {
       this.stmtTableCache = stmtTableCache;
@@ -2847,6 +2853,17 @@ public class Analyzer {
   public int decrementCallDepth() { return --callDepth_; }
   public int getCallDepth() { return callDepth_; }
 
+  public int incrementNumStmtExprs() { return globalState_.numStmtExprs_++; }
+  public int getNumStmtExprs() { return globalState_.numStmtExprs_; }
+  public void checkStmtExprLimit() throws AnalysisException {
+    int statementExpressionLimit = getQueryOptions().getStatement_expression_limit();
+    if (getNumStmtExprs() > statementExpressionLimit) {
+      String errorStr = String.format("Exceeded the statement expression limit (%d)\n" +
+          "Statement has %d expressions.", statementExpressionLimit, getNumStmtExprs());
+      throw new AnalysisException(errorStr);
+    }
+  }
+
   public boolean hasMutualValueTransfer(SlotId a, SlotId b) {
     return hasValueTransfer(a, b) && hasValueTransfer(b, a);
   }
diff --git a/fe/src/main/java/org/apache/impala/analysis/Expr.java b/fe/src/main/java/org/apache/impala/analysis/Expr.java
index 53a0081..1cfa938 100644
--- a/fe/src/main/java/org/apache/impala/analysis/Expr.java
+++ b/fe/src/main/java/org/apache/impala/analysis/Expr.java
@@ -347,6 +347,9 @@ abstract public class Expr extends TreeNode<Expr> implements ParseNode, Cloneabl
   // analysisDone().
   private boolean isAnalyzed_ = false;
 
+  // True if this has already been counted towards the number of statement expressions
+  private boolean isCountedForNumStmtExprs_ = false;
+
   protected Expr() {
     type_ = Type.INVALID;
     selectivity_ = -1.0;
@@ -369,6 +372,7 @@ abstract public class Expr extends TreeNode<Expr> implements ParseNode, Cloneabl
     numDistinctValues_ = other.numDistinctValues_;
     isConstant_ = other.isConstant_;
     fn_ = other.fn_;
+    isCountedForNumStmtExprs_ = other.isCountedForNumStmtExprs_;
     children_ = Expr.cloneList(other.children_);
   }
 
@@ -420,6 +424,7 @@ abstract public class Expr extends TreeNode<Expr> implements ParseNode, Cloneabl
         throw new AnalysisException(String.format("Exceeded the maximum depth of an " +
             "expression tree (%s).", EXPR_DEPTH_LIMIT));
       }
+      incrementNumStmtExprs(analyzer);
     }
     for (Expr child: children_) {
       child.analyze(analyzer);
@@ -453,6 +458,24 @@ abstract public class Expr extends TreeNode<Expr> implements ParseNode, Cloneabl
   }
 
   /**
+   * Helper function to properly count the number of statement expressions.
+   * If this expression has not been counted already and this is not a WITH clause,
+   * increment the number of statement expressions. This function guarantees that an
+   * expression will be counted at most once.
+   */
+  private void incrementNumStmtExprs(Analyzer analyzer) {
+    // WITH clauses use a separate Analyzer with its own GlobalState. Skip counting
+    // this expression towards that GlobalState. If the view defined by the WITH
+    // clause is referenced, it will be counted during that analysis.
+    if (analyzer.hasWithClause()) return;
+    // If the expression is already counted, do not count it again. This is important
+    // for expressions that can be cloned (e.g. when doing Expr::trySubstitute()).
+    if (isCountedForNumStmtExprs_) return;
+    analyzer.incrementNumStmtExprs();
+    isCountedForNumStmtExprs_ = true;
+  }
+
+  /**
    * Compute and return evalcost of this expr given the evalcost of all children has been
    * computed. Should be called bottom-up whenever the structure of subtree is modified.
    */
diff --git a/fe/src/test/java/org/apache/impala/analysis/AnalyzeExprsTest.java b/fe/src/test/java/org/apache/impala/analysis/AnalyzeExprsTest.java
index 23910f4..5fcb747 100644
--- a/fe/src/test/java/org/apache/impala/analysis/AnalyzeExprsTest.java
+++ b/fe/src/test/java/org/apache/impala/analysis/AnalyzeExprsTest.java
@@ -2399,6 +2399,165 @@ public class AnalyzeExprsTest extends AnalyzerTest {
     testFuncExprDepthLimit("cast(", "1", " as int)");
   }
 
+  /** Generates a specific number of expressions by repeating a column reference.
+   * It generates a string containing 'num_copies' expressions. If 'use_alias' is true,
+   * each column reference has a distinct alias. This allows the repeated columns to be
+   * used in places that require distinct names (e.g. the definition of a view).
+   */
+  String getRepeatedColumnReference(String col_name, int num_copies, boolean use_alias) {
+    StringBuilder repColRef = new StringBuilder();
+
+    for (int i = 0; i < num_copies; ++i) {
+      if (i != 0) repColRef.append(", ");
+      repColRef.append(col_name);
+      if (use_alias) repColRef.append(String.format(" alias%d", i));
+    }
+    return repColRef.toString();
+  }
+
+  /** Get the error message for a statement with 'actual_num_expressions' that exceeds
+   * the statment_expression_limit 'limit'.
+   */
+  String getExpressionLimitErrorStr(int actual_num_expressions, int limit) {
+    return String.format("Exceeded the statement expression limit (%d)\n" +
+        "Statement has %d expressions", limit, actual_num_expressions);
+  }
+
+  @Test
+  public void TestStatementExprLimit() {
+    // To keep it simple and fast, we use a low value for statement_expression_limit.
+    // Since the statement_expression_limit is evaluated before rewrites, turn off
+    // expression rewrites.
+    TQueryOptions queryOpts = new TQueryOptions();
+    queryOpts.setStatement_expression_limit(20);
+    queryOpts.setEnable_expr_rewrites(false);
+    AnalysisContext ctx = createAnalysisCtx(queryOpts);
+
+    // Known SQL patterns (repeated reference to column) that generate 20 and 21
+    // expressions, respectively.
+    String repCols20 = getRepeatedColumnReference("int_col", 20, true);
+    String repCols21 = getRepeatedColumnReference("int_col", 21, true);
+
+    // Select from table
+    AnalyzesOk(String.format("select %s from functional.alltypes", repCols20), ctx);
+    assertEquals(ctx.getAnalyzer().getNumStmtExprs(), 20);
+    AnalysisError(String.format("select %s from functional.alltypes", repCols21),
+        ctx, getExpressionLimitErrorStr(21, 20));
+
+    // WHERE clause
+    // This statement has 20 expressions without the WHERE clause, as tested above.
+    // So, this will only be an error if the bool_col in the WHERE clause counts.
+    AnalysisError(String.format("select %s from functional.alltypes where bool_col",
+        repCols20), ctx, getExpressionLimitErrorStr(21, 20));
+
+    // Create table as select
+    AnalyzesOk(String.format("create table exprlimit1 as " +
+        "select %s from functional.alltypes", repCols20), ctx);
+    assertEquals(ctx.getAnalyzer().getNumStmtExprs(), 20);
+    AnalysisError(String.format("create table exprlimit1 as " +
+        "select %s from functional.alltypes", repCols21), ctx,
+        getExpressionLimitErrorStr(21, 20));
+
+    // Create view
+    AnalyzesOk(String.format("create view exprlimit1 as " +
+        "select %s from functional.alltypes", repCols20), ctx);
+    assertEquals(ctx.getAnalyzer().getNumStmtExprs(), 20);
+    AnalysisError(String.format("create view exprlimit1 as " +
+        "select %s from functional.alltypes", repCols21), ctx,
+        getExpressionLimitErrorStr(21, 20));
+
+    // Subquery
+    AnalyzesOk(String.format("select * from (select %s from functional.alltypes) x",
+        repCols20), ctx);
+    assertEquals(ctx.getAnalyzer().getNumStmtExprs(), 20);
+    AnalysisError(String.format("select * from (select %s from functional.alltypes) x",
+        repCols21), ctx, getExpressionLimitErrorStr(21, 20));
+
+    // WITH clause
+    AnalyzesOk(String.format("with v as (select %s from functional.alltypes) " +
+        "select * from v", repCols20), ctx);
+    assertEquals(ctx.getAnalyzer().getNumStmtExprs(), 20);
+    AnalysisError(String.format("with v as (select %s from functional.alltypes) " +
+        "select * from v", repCols21), ctx, getExpressionLimitErrorStr(21, 20));
+
+    // View is counted (functional.alltypes_parens has a few expressions)
+    AnalysisError(String.format("select %s from functional.alltypes_parens", repCols20),
+        ctx, getExpressionLimitErrorStr(32, 20));
+
+    // If a view is referenced multiple times, it counts multiple times.
+    AnalysisError(String.format("with v as (select %s from functional.alltypes) " +
+        "select * from v v1,v v2", repCols20), ctx, getExpressionLimitErrorStr(40, 20));
+
+    // If a view is not referenced, it doesn't count.
+    AnalyzesOk(String.format("with v as (select %s from functional.alltypes) select 1",
+        repCols21), ctx);
+    // This is zero because literals do not count.
+    assertEquals(ctx.getAnalyzer().getNumStmtExprs(), 0);
+
+    // EXPLAIN is not exempted from the limit
+    AnalysisError(String.format("explain select %s from functional.alltypes", repCols21),
+        ctx, getExpressionLimitErrorStr(21, 20));
+
+    // Wide table doesn't impact *
+    AnalyzesOk("select * from functional.widetable_1000_cols", ctx);
+
+    // Literal expressions don't count towards the limit, so build a list of literals
+    // to use to test various cases.
+    StringBuilder literalList = new StringBuilder();
+    for (int i = 0; i < 200; ++i) {
+      if (i != 0) literalList.append(", ");
+      literalList.append(i);
+    }
+
+    // Literals in the select list do not count
+    AnalyzesOk(String.format("select %s", literalList.toString()), ctx);
+    assertEquals(ctx.getAnalyzer().getNumStmtExprs(), 0);
+
+    // Literals in an IN list do not count
+    AnalyzesOk(String.format("select int_col IN (%s) from functional.alltypes",
+        literalList.toString()), ctx);
+
+    // Literals in a VALUES clause do not count
+    StringBuilder valuesList = new StringBuilder();
+    for (int i = 0; i < 200; ++i) {
+      if (i != 0) valuesList.append(", ");
+      valuesList.append("(" + i + ")");
+    }
+    AnalyzesOk("insert into functional.insert_overwrite_nopart (col1) VALUES " +
+        valuesList.toString(), ctx);
+    assertEquals(ctx.getAnalyzer().getNumStmtExprs(), 0);
+
+    // Expressions that are operators applied to constants still count
+    StringBuilder constantExpr = new StringBuilder();
+    for (int i = 0; i < 21; ++i) {
+      if (i != 0) constantExpr.append(" * ");
+      constantExpr.append(i);
+    }
+    // 21 literals with 20 multiplications requires 20 ArithmeticExprs
+    AnalyzesOk(String.format("select %s", constantExpr.toString()), ctx);
+    assertEquals(ctx.getAnalyzer().getNumStmtExprs(), 20);
+    constantExpr.append(" * 100");
+    // 22 literals with 21 multiplications requires 21 ArithmeticExprs
+    AnalysisError(String.format("select %s", constantExpr.toString()),
+        ctx, getExpressionLimitErrorStr(21, 20));
+
+    // Put a non-literal in an IN list to verify it still counts appropriately.
+    StringBuilder inListExpr = new StringBuilder();
+    for (int i = 0; i < 19; i++) {
+      if (i != 0) inListExpr.append(" * ");
+      inListExpr.append(i);
+    }
+    // 19 literals with 18 multiplications = 18 expressions. int_col and IN are another
+    // two expressions. This has 20 expressions total.
+    AnalyzesOk(String.format("select int_col IN (%s) from functional.alltypes",
+        inListExpr.toString()), ctx);
+    assertEquals(ctx.getAnalyzer().getNumStmtExprs(), 20);
+    // Adding one more expression pushes us over the limit.
+    inListExpr.append(" * 100");
+    AnalysisError(String.format("select int_col IN (%s) from functional.alltypes",
+        inListExpr.toString()), ctx, getExpressionLimitErrorStr(21, 20));
+  }
+
   // Verifies the resulting expr decimal type is expectedType under decimal v1 or
   // decimal v2.
   private void testDecimalExpr(String expr, Type decimalExpectedType, boolean isV2) {
diff --git a/tests/common/impala_connection.py b/tests/common/impala_connection.py
index 8b275fb..f1a33bb 100644
--- a/tests/common/impala_connection.py
+++ b/tests/common/impala_connection.py
@@ -41,6 +41,26 @@ LOG.propagate = False
 PROGRESS_LOG_RE = re.compile(
     r'^Query [a-z0-9:]+ [0-9]+% Complete \([0-9]+ out of [0-9]+\)$')
 
+MAX_SQL_LOGGING_LENGTH = 128 * 1024
+
+
+# test_exprs.py's TestExprLimits executes extremely large SQLs (multiple MBs). It is the
+# only test that runs SQL larger than 128KB. Logging these SQLs in execute() increases
+# the size of the JUnitXML files, causing problems for users of JUnitXML like Jenkins.
+# This function limits the size of the SQL logged if it is larger than 128KB.
+def log_sql_stmt(sql_stmt):
+  """If the 'sql_stmt' is shorter than MAX_SQL_LOGGING_LENGTH, log it unchanged. If
+     it is larger than MAX_SQL_LOGGING_LENGTH, truncate it and comment it out."""
+  if (len(sql_stmt) <= MAX_SQL_LOGGING_LENGTH):
+    LOG.info("{0};\n".format(sql_stmt))
+  else:
+    # The logging output should be valid SQL, so the truncated SQL is commented out.
+    LOG.info("-- Skip logging full SQL statement of length {0}".format(len(sql_stmt)))
+    LOG.info("-- Logging a truncated version, commented out:")
+    for line in sql_stmt[0:MAX_SQL_LOGGING_LENGTH].split("\n"):
+      LOG.info("-- {0}".format(line))
+    LOG.info("-- [...]")
+
 # Common wrapper around the internal types of HS2/Beeswax operation/query handles.
 class OperationHandle(object):
   def __init__(self, handle, sql_stmt):
@@ -180,11 +200,13 @@ class BeeswaxConnection(ImpalaConnection):
     self.__beeswax_client.close_dml(operation_handle.get_handle())
 
   def execute(self, sql_stmt, user=None):
-    LOG.info("-- executing against %s\n%s;\n" % (self.__host_port, sql_stmt))
+    LOG.info("-- executing against %s\n" % (self.__host_port))
+    log_sql_stmt(sql_stmt)
     return self.__beeswax_client.execute(sql_stmt, user=user)
 
   def execute_async(self, sql_stmt, user=None):
-    LOG.info("-- executing async: %s\n%s;\n" % (self.__host_port, sql_stmt))
+    LOG.info("-- executing async: %s\n" % (self.__host_port))
+    log_sql_stmt(sql_stmt)
     beeswax_handle = self.__beeswax_client.execute_query_async(sql_stmt, user=user)
     return OperationHandle(beeswax_handle, sql_stmt)
 
@@ -312,8 +334,9 @@ class ImpylaHS2Connection(ImpalaConnection):
         return r
 
   def execute_async(self, sql_stmt, user=None):
-    LOG.info("-- executing against {0} at {1}\n{2};\n".format(
-        self._is_hive and 'Hive' or 'Impala', self.__host_port, sql_stmt))
+    LOG.info("-- executing against {0} at {1}\n".format(
+        self._is_hive and 'Hive' or 'Impala', self.__host_port))
+    log_sql_stmt(sql_stmt)
     if user is not None:
       raise NotImplementedError("Not yet implemented for HS2 - authentication")
     try:
diff --git a/tests/query_test/test_exprs.py b/tests/query_test/test_exprs.py
index eb50ee7..6722ca8 100644
--- a/tests/query_test/test_exprs.py
+++ b/tests/query_test/test_exprs.py
@@ -16,6 +16,8 @@
 # under the License.
 
 import pytest
+import re
+from random import randint
 
 from tests.common.impala_test_suite import ImpalaTestSuite
 from tests.common.test_dimensions import create_exec_option_dimension
@@ -131,6 +133,67 @@ class TestExprLimits(ImpalaTestSuite):
     cast_query = "select " + self.__gen_deep_func_expr("cast(", "1", " as int)")
     self.__exec_query(cast_query)
 
+  def test_under_statement_expression_limit(self):
+    """Generate a huge case statement that barely fits within the statement expression
+       limit and verify that it runs."""
+    # This takes 20+ minutes, so only run it on exhaustive.
+    # TODO: Determine whether this needs to run serially. It use >5 GB of memory.
+    if self.exploration_strategy() != 'exhaustive':
+      pytest.skip("Only test limit of codegen on exhaustive")
+    case = self.__gen_huge_case("int_col", 32, 2, "  ")
+    query = "select {0} as huge_case from functional_parquet.alltypes".format(case)
+    self.__exec_query(query)
+
+  def test_max_statement_size(self):
+    """Generate a huge case statement that exceeds the default 16MB limit and verify
+       that it gets rejected."""
+
+    expected_err_tmpl = ("Statement length of {0} bytes exceeds the maximum "
+        "statement length \({1} bytes\)")
+    size_16mb = 16 * 1024 * 1024
+
+    # Case 1: a valid SQL that would parse correctly
+    case = self.__gen_huge_case("int_col", 75, 2, "  ")
+    query = "select {0} as huge_case from functional.alltypes".format(case)
+    err = self.execute_query_expect_failure(self.client, query)
+    assert re.search(expected_err_tmpl.format(len(query), size_16mb), str(err))
+
+    # Case 2: a string of 'a' characters that does not parse. This will still fail
+    # with the same message, because the check is before parsing.
+    invalid_sql = 'a' * (size_16mb + 1)
+    err = self.execute_query_expect_failure(self.client, invalid_sql)
+    assert re.search(expected_err_tmpl.format(len(invalid_sql), size_16mb), str(err))
+
+  def test_statement_expression_limit(self):
+    """Generate a huge case statement that barely fits within the 16MB limit but exceeds
+       the statement expression limit. Verify that it fails."""
+    case = self.__gen_huge_case("int_col", 66, 2, "  ")
+    query = "select {0} as huge_case from functional.alltypes".format(case)
+    assert len(query) < 16 * 1024 * 1024
+    expected_err_re = ("Exceeded the statement expression limit \({0}\)\n"
+        "Statement has .* expressions.").format(250000)
+    err = self.execute_query_expect_failure(self.client, query)
+    assert re.search(expected_err_re, str(err))
+
+  def __gen_huge_case(self, col_name, fanout, depth, indent):
+    toks = ["case\n"]
+    for i in xrange(fanout):
+      add = randint(1, 1000000)
+      divisor = randint(1, 10000000)
+      mod = randint(0, divisor)
+      # Generate a mathematical expr that can't be easily optimised out.
+      when_expr = "{0} + {1} % {2} = {3}".format(col_name, add, divisor, mod)
+      if depth == 0:
+        then_expr = "{0}".format(i)
+      else:
+        then_expr = "({0})".format(
+            self.__gen_huge_case(col_name, fanout, depth - 1, indent + "  "))
+      toks.append(indent)
+      toks.append("when {0} then {1}\n".format(when_expr, then_expr))
+    toks.append(indent)
+    toks.append("end")
+    return ''.join(toks)
+
   def __gen_deep_infix_expr(self, prefix, repeat_suffix):
     expr = prefix
     for i in xrange(self.EXPR_DEPTH_LIMIT - 1):
@@ -150,8 +213,8 @@ class TestExprLimits(ImpalaTestSuite):
     try:
       impala_ret = self.execute_query(sql_str)
       assert impala_ret.success, "Failed to execute query %s" % (sql_str)
-    except: # consider any exception a failure
-      assert False, "Failed to execute query %s" % (sql_str)
+    except Exception as e:  # consider any exception a failure
+      assert False, "Failed to execute query %s: %s" % (sql_str, e)
 
 class TestUtcTimestampFunctions(ImpalaTestSuite):
   """Tests for UTC timestamp functions, i.e. functions that do not depend on the behavior