Posted to commits@impala.apache.org by jo...@apache.org on 2019/05/03 17:06:39 UTC

[impala] branch master updated (79c5f87 -> 99e1a39)

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git.


    from 79c5f87  IMPALA-8121: part 1: some test fixes for catalog v2
     new c2516d2  IMPALA-8409: Fix row-size for STRING columns with unknown stats
     new 04be046  IMPALA-8482: Package ranger-plugins-audit runtime dependencies
     new 99e1a39  Bump CDP_BUILD_NUMBER to 1056671

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 bin/bootstrap_toolchain.py                         |  2 +-
 bin/impala-config.sh                               | 23 +++-----
 fe/pom.xml                                         |  6 ---
 .../apache/impala/analysis/TupleDescriptor.java    |  2 +-
 .../org/apache/impala/catalog/ColumnStats.java     |  1 -
 .../org/apache/impala/planner/HdfsScanNode.java    |  3 ++
 .../org/apache/impala/catalog/CatalogTest.java     | 61 +++++++--------------
 tests/metadata/test_explain.py                     | 63 +++++++++++++++++++---
 8 files changed, 87 insertions(+), 74 deletions(-)


[impala] 02/03: IMPALA-8482: Package ranger-plugins-audit runtime dependencies

Posted by jo...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 04be046ecc3a4d43a62dc834ea4925f979d2dc27
Author: Fredy Wijaya <fw...@cloudera.com>
AuthorDate: Thu May 2 09:47:10 2019 -0700

    IMPALA-8482: Package ranger-plugins-audit runtime dependencies
    
    This patch includes the ranger-plugins-audit runtime dependencies so
    that ranger-plugins-audit can communicate with different audit
    providers, such as Solr, Kafka, etc.
    
    Testing:
    - Ran core tests
    
    Change-Id: If4c88958b064032ebaedd45808482f1179e6d806
    Reviewed-on: http://gerrit.cloudera.org:8080/13216
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 fe/pom.xml | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/fe/pom.xml b/fe/pom.xml
index 2a20e58..43701b4 100644
--- a/fe/pom.xml
+++ b/fe/pom.xml
@@ -121,12 +121,6 @@ under the License.
       <groupId>org.apache.ranger</groupId>
       <artifactId>ranger-plugins-audit</artifactId>
       <version>${ranger.version}</version>
-      <exclusions>
-        <exclusion>
-          <groupId>*</groupId>
-          <artifactId>*</artifactId>
-        </exclusion>
-      </exclusions>
     </dependency>
     <!-- this is needed by ranger-plugins-audit -->
     <dependency>


[impala] 03/03: Bump CDP_BUILD_NUMBER to 1056671

Posted by jo...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 99e1a39b908b81a94ef8cf4b41458c388a34755c
Author: Vihang Karajgaonkar <vi...@cloudera.com>
AuthorDate: Wed May 1 19:07:40 2019 -0700

    Bump CDP_BUILD_NUMBER to 1056671
    
    This change bumps the CDP_BUILD_NUMBER to 1056671, which includes all
    the Hive and Tez patches required for building against Hive 3. With
    this change we get rid of the custom builds for Hive and Tez
    introduced in IMPALA-8369 and switch to more official build sources
    for the minicluster.
    
    Notes:
    1. The tarball names and the directories to which they extract changed
    from the previous CDP_BUILD_NUMBER. Because of this, we need to change
    bootstrap_toolchain.py and impala-config.sh so that the Hive
    environment variables are set correctly (see the sketch below).
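
    To make the renaming concrete, here is a minimal Python sketch of the
    old and new names for the Hive source artifact, using the
    CDP_HIVE_VERSION from the diff below (an illustration of the change,
    not Impala code):

        hive_version = "3.1.0.6.0.99.0-147"  # CDP_HIVE_VERSION after this change

        # Before: the source component and its extraction directory shared
        # the "apache-hive-<version>-src" name.
        old_component = "apache-hive-{0}-src".format(hive_version)
        old_src_dir = "apache-hive-{0}-src".format(hive_version)

        # After: the component is named "hive-<version>-source" but extracts
        # to "hive-<version>", which is why bootstrap_toolchain.py and
        # impala-config.sh have to change together.
        new_component = "hive-{0}-source".format(hive_version)
        new_src_dir = "hive-{0}".format(hive_version)

        print(old_component, "->", new_component)  # ...-src -> hive-...-source
        print(old_src_dir, "->", new_src_dir)      # ...-src -> hive-...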
    
    Testing Done:
    1. Built against Hive 3 and Hive 2 using the USE_CDP_HIVE flag
    2. Did basic testing from Impala and Beeline to exercise the Tez patch
    3. Currently running the full suite of tests to make sure there are no
    regressions
    
    Change-Id: Ic758a15b33e89b6804c12356aac8e3f230e07ae0
    Reviewed-on: http://gerrit.cloudera.org:8080/13213
    Reviewed-by: Fredy Wijaya <fw...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 bin/bootstrap_toolchain.py |  2 +-
 bin/impala-config.sh       | 23 ++++++-----------------
 2 files changed, 7 insertions(+), 18 deletions(-)

diff --git a/bin/bootstrap_toolchain.py b/bin/bootstrap_toolchain.py
index 9bfd6c8..6a6eeb7 100755
--- a/bin/bootstrap_toolchain.py
+++ b/bin/bootstrap_toolchain.py
@@ -570,7 +570,7 @@ if __name__ == "__main__":
   ]
   use_cdp_hive = os.getenv("USE_CDP_HIVE") == "true"
   if use_cdp_hive:
-    cdp_components.append(CdpComponent("apache-hive-{0}-src"
+    cdp_components.append(CdpComponent("hive-{0}-source"
                           .format(os.environ.get("IMPALA_HIVE_VERSION")))),
     cdp_components.append(CdpComponent("apache-hive-{0}-bin"
                           .format(os.environ.get("IMPALA_HIVE_VERSION")))),
diff --git a/bin/impala-config.sh b/bin/impala-config.sh
index 5eeb254..cc8cfef 100755
--- a/bin/impala-config.sh
+++ b/bin/impala-config.sh
@@ -161,21 +161,18 @@ fi
 export IMPALA_TOOLCHAIN_HOST
 export CDH_MAJOR_VERSION=6
 export CDH_BUILD_NUMBER=1055188
-export CDP_BUILD_NUMBER=1013201
+export CDP_BUILD_NUMBER=1056671
 export IMPALA_HADOOP_VERSION=3.0.0-cdh6.x-SNAPSHOT
 export IMPALA_HBASE_VERSION=2.1.0-cdh6.x-SNAPSHOT
 export IMPALA_SENTRY_VERSION=2.1.0-cdh6.x-SNAPSHOT
-export IMPALA_RANGER_VERSION=1.2.0.6.0.99.0-45
+export IMPALA_RANGER_VERSION=1.2.0.6.0.99.0-147
 export IMPALA_PARQUET_VERSION=1.9.0-cdh6.x-SNAPSHOT
 export IMPALA_AVRO_JAVA_VERSION=1.8.2-cdh6.x-SNAPSHOT
 export IMPALA_LLAMA_MINIKDC_VERSION=1.0.0
 export IMPALA_KITE_VERSION=1.0.0-cdh6.x-SNAPSHOT
 export KUDU_JAVA_VERSION=1.10.0-cdh6.x-SNAPSHOT
 export CDH_HIVE_VERSION=2.1.1-cdh6.x-SNAPSHOT
-# This is a custom build of Hive which includes patches for HIVE-21586
-# HIVE-21077, HIVE-21526, HIVE-21561
-# TODO Use a official once these patches are merged
-export CDP_HIVE_VERSION=3.1.0.6.0.99.0-38-0e7f6337a50
+export CDP_HIVE_VERSION=3.1.0.6.0.99.0-147
 
 # When IMPALA_(CDH_COMPONENT)_URL are overridden, they may contain '$(platform_label)'
 # which will be substituted for the CDH platform label in bootstrap_toolchain.py
@@ -202,15 +199,7 @@ if $USE_CDP_HIVE; then
   # When USE_CDP_HIVE is set we use the CDP hive version to build as well as deploy in
   # the minicluster
   export IMPALA_HIVE_VERSION=${CDP_HIVE_VERSION}
-  # Temporary version of Tez, patched with the fix for TEZ-1348:
-  # https://github.com/apache/tez/pull/40
-  # We'll switch to a non-"todd" version of Tez once that fix is integrated.
-  # For now, if you're bumping the CDP build number, you'll need to download
-  # this tarball from an earlier build and re-upload it to the new directory
-  # in the toolchain bucket.
-  #
-  # TODO(todd) switch to an official build.
-  export IMPALA_TEZ_VERSION=0.10.0-todd-6fcc41e5798b.1
+  export IMPALA_TEZ_VERSION=0.9.1.6.0.99.0-147
 else
   # CDH hive version is used to build and deploy in minicluster when USE_CDP_HIVE is
   # false
@@ -311,8 +300,8 @@ export LOCAL_FS="file:${WAREHOUSE_LOCATION_PREFIX}"
 ESCAPED_IMPALA_HOME=$(sed "s/[^0-9a-zA-Z]/_/g" <<< "$IMPALA_HOME")
 if $USE_CDP_HIVE; then
   export HIVE_HOME="$CDP_COMPONENTS_HOME/apache-hive-${IMPALA_HIVE_VERSION}-bin"
-  export HIVE_SRC_DIR=${HIVE_SRC_DIR_OVERRIDE:-"${CDP_COMPONENTS_HOME}/apache-hive-\
-${IMPALA_HIVE_VERSION}-src"}
+  export HIVE_SRC_DIR=${HIVE_SRC_DIR_OVERRIDE:-"${CDP_COMPONENTS_HOME}/hive-\
+${IMPALA_HIVE_VERSION}"}
   # Set the path to the hive_metastore.thrift which is used to build thrift code
   export HIVE_METASTORE_THRIFT_DIR=$HIVE_SRC_DIR/standalone-metastore/src/main/thrift
   # It is likely that devs will want to work with both the versions of metastore


[impala] 01/03: IMPALA-8409: Fix row-size for STRING columns with unknown stats

Posted by jo...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit c2516d220da8e532b6ebdb6f3a12e7ad97c4f597
Author: Csaba Ringhofer <cs...@cloudera.com>
AuthorDate: Wed Apr 17 15:01:07 2019 +0200

    IMPALA-8409: Fix row-size for STRING columns with unknown stats
    
    Explain returned row-size=11B for STRING columns without statistics.
    The issue was caused by adding -1 (meaning unknown) to the 12 byte
    slot size (sizeof(StringValue)). The code in TupleDescriptor.java
    tried to handle this by checking if the size is -1, but it was
    already 11 at this point.
    
    There is more potential for cleanup, but I wanted to keep this
    change minimal.
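
    To make the arithmetic concrete, a minimal Python sketch of the
    estimate described above (constants taken from this message and from
    the EE test below; the function is illustrative, not Impala's API):

        STRING_SLOT_SIZE = 12  # sizeof(StringValue)
        UNKNOWN = -1           # marker for missing column stats

        # Before the fix: the unknown marker was added to the slot size
        # before the -1 check could catch it, so EXPLAIN reported
        # row-size=11B (12 + (-1)) for a STRING column without stats.
        buggy_row_size = STRING_SLOT_SIZE + UNKNOWN
        assert buggy_row_size == 11

        # After the fix: check for valid stats first and fall back to the
        # plain slot size when the average size is unknown.
        def row_size(avg_size):
            if avg_size >= 0:
                return STRING_SLOT_SIZE + avg_size
            return STRING_SLOT_SIZE

        assert row_size(UNKNOWN) == 12  # no stats: slot size only
        assert row_size(4) == 16        # with stats: slot size + avg_size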
    
    Testing:
    - revived some tests in CatalogTest.java that were removed
      in 2013 due to flakiness
    - added an EE test that checks row size with and without stats
    - fixed a similar test, test_explain_validate_cardinality_estimates
      (the format of the line it looks for had changed, which led to the
      actual verification being skipped and everything passing; see the
      demo after this list)
    - ran core FE and EE tests
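
    To see why the old check silently passed, a small Python demo (the
    sample line format is inferred from the new regex in the diff below):

        import re

        line = "tuple-ids=0 row-size=16B cardinality=2"  # illustrative EXPLAIN line

        old = re.compile(r' row-size=\d+B cardinality=(.*)$')
        new = re.compile(r'tuple-ids=.+ row-size=(\d+)B cardinality=(.*)')

        # re.match() anchors at the start of the string, and a stripped line
        # never begins with the literal space the old pattern requires, so
        # the old regex matched nothing and no assertion ever ran.
        assert old.match(line.strip()) is None
        assert new.match(line.strip()).groups() == ("16", "2")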
    
    Change-Id: I866acf10b2c011a735dee019f4bc29358f2ec4e5
    Reviewed-on: http://gerrit.cloudera.org:8080/13190
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 .../apache/impala/analysis/TupleDescriptor.java    |  2 +-
 .../org/apache/impala/catalog/ColumnStats.java     |  1 -
 .../org/apache/impala/planner/HdfsScanNode.java    |  3 ++
 .../org/apache/impala/catalog/CatalogTest.java     | 61 +++++++--------------
 tests/metadata/test_explain.py                     | 63 +++++++++++++++++++---
 5 files changed, 80 insertions(+), 50 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/analysis/TupleDescriptor.java b/fe/src/main/java/org/apache/impala/analysis/TupleDescriptor.java
index 87b8e5f..557874d 100644
--- a/fe/src/main/java/org/apache/impala/analysis/TupleDescriptor.java
+++ b/fe/src/main/java/org/apache/impala/analysis/TupleDescriptor.java
@@ -268,7 +268,7 @@ public class TupleDescriptor {
       ColumnStats stats = d.getStats();
       int slotSize = d.getType().getSlotSize();
 
-      if (stats.hasAvgSerializedSize()) {
+      if (stats.hasAvgSize()) {
         avgSerializedSize_ += d.getStats().getAvgSerializedSize();
       } else {
         // TODO: for computed slots, try to come up with stats estimates
diff --git a/fe/src/main/java/org/apache/impala/catalog/ColumnStats.java b/fe/src/main/java/org/apache/impala/catalog/ColumnStats.java
index a3e40cb..65533da 100644
--- a/fe/src/main/java/org/apache/impala/catalog/ColumnStats.java
+++ b/fe/src/main/java/org/apache/impala/catalog/ColumnStats.java
@@ -166,7 +166,6 @@ public class ColumnStats {
   public boolean hasNulls() { return numNulls_ > 0; }
   public long getNumNulls() { return numNulls_; }
   public boolean hasAvgSize() { return avgSize_ >= 0; }
-  public boolean hasAvgSerializedSize() { return avgSerializedSize_ >= 0; }
   public boolean hasNumDistinctValues() { return numDistinctValues_ >= 0; }
   public boolean hasStats() { return numNulls_ != -1 || numDistinctValues_ != -1; }
 
diff --git a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
index ce5c850..2a2f4e6 100644
--- a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
@@ -1694,6 +1694,9 @@ public class HdfsScanNode extends ScanNode {
     if (stats.hasAvgSize() && maxScanRangeNumRows_ != -1) {
       // Estimate the column's uncompressed data size based on row count and average
       // size.
+      // TODO: Size of strings seems to be underestimated, as avg size returns the
+      //       average length of the strings and does not include the 4 byte length
+      //       field used in Parquet plain encoding. (IMPALA-8431)
       reservationBytes =
           (long) Math.min(reservationBytes, stats.getAvgSize() * maxScanRangeNumRows_);
       if (stats.hasNumDistinctValues()) {
diff --git a/fe/src/test/java/org/apache/impala/catalog/CatalogTest.java b/fe/src/test/java/org/apache/impala/catalog/CatalogTest.java
index 80a33fc..36c9919 100644
--- a/fe/src/test/java/org/apache/impala/catalog/CatalogTest.java
+++ b/fe/src/test/java/org/apache/impala/catalog/CatalogTest.java
@@ -23,6 +23,7 @@ import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertSame;
 import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.fail;
 
 import java.io.IOException;
@@ -471,89 +472,71 @@ public class CatalogTest {
     assertEquals(1, uniqueSds.size());
   }
 
-  // TODO: All Hive-stats related tests are temporarily disabled because of an unknown,
-  // sporadic issue causing stats of some columns to be absent in Jenkins runs.
-  // Investigate this issue further.
-  //@Test
-  public void testStats() throws TableLoadingException {
+  @Test
+  public void testStats() throws CatalogException {
     // make sure the stats for functional.alltypesagg look correct
-    HdfsTable table =
-        (HdfsTable) catalog_.getDb("functional").getTable("AllTypesAgg");
+    HdfsTable table = (HdfsTable) catalog_.getOrLoadTable("functional", "AllTypesAgg");
 
     Column idCol = table.getColumn("id");
-    assertEquals(idCol.getStats().getAvgSerializedSize() -
-        PrimitiveType.INT.getSlotSize(),
+    assertEquals(idCol.getStats().getAvgSerializedSize(),
         PrimitiveType.INT.getSlotSize(), 0.0001);
     assertEquals(idCol.getStats().getMaxSize(), PrimitiveType.INT.getSlotSize());
-    assertTrue(!idCol.getStats().hasNulls());
+    assertFalse(idCol.getStats().hasNulls());
 
     Column boolCol = table.getColumn("bool_col");
-    assertEquals(boolCol.getStats().getAvgSerializedSize() -
-        PrimitiveType.BOOLEAN.getSlotSize(),
+    assertEquals(boolCol.getStats().getAvgSerializedSize(),
         PrimitiveType.BOOLEAN.getSlotSize(), 0.0001);
     assertEquals(boolCol.getStats().getMaxSize(), PrimitiveType.BOOLEAN.getSlotSize());
-    assertTrue(!boolCol.getStats().hasNulls());
+    assertFalse(boolCol.getStats().hasNulls());
 
     Column tinyintCol = table.getColumn("tinyint_col");
-    assertEquals(tinyintCol.getStats().getAvgSerializedSize() -
-        PrimitiveType.TINYINT.getSlotSize(),
+    assertEquals(tinyintCol.getStats().getAvgSerializedSize(),
         PrimitiveType.TINYINT.getSlotSize(), 0.0001);
-    assertEquals(tinyintCol.getStats().getMaxSize(),
-        PrimitiveType.TINYINT.getSlotSize());
+    assertEquals(tinyintCol.getStats().getMaxSize(), PrimitiveType.TINYINT.getSlotSize());
     assertTrue(tinyintCol.getStats().hasNulls());
 
     Column smallintCol = table.getColumn("smallint_col");
-    assertEquals(smallintCol.getStats().getAvgSerializedSize() -
-        PrimitiveType.SMALLINT.getSlotSize(),
+    assertEquals(smallintCol.getStats().getAvgSerializedSize(),
         PrimitiveType.SMALLINT.getSlotSize(), 0.0001);
     assertEquals(smallintCol.getStats().getMaxSize(),
         PrimitiveType.SMALLINT.getSlotSize());
     assertTrue(smallintCol.getStats().hasNulls());
 
     Column intCol = table.getColumn("int_col");
-    assertEquals(intCol.getStats().getAvgSerializedSize() -
-        PrimitiveType.INT.getSlotSize(),
+    assertEquals(intCol.getStats().getAvgSerializedSize(),
         PrimitiveType.INT.getSlotSize(), 0.0001);
     assertEquals(intCol.getStats().getMaxSize(), PrimitiveType.INT.getSlotSize());
     assertTrue(intCol.getStats().hasNulls());
 
     Column bigintCol = table.getColumn("bigint_col");
-    assertEquals(bigintCol.getStats().getAvgSerializedSize() -
-        PrimitiveType.BIGINT.getSlotSize(),
+    assertEquals(bigintCol.getStats().getAvgSerializedSize(),
         PrimitiveType.BIGINT.getSlotSize(), 0.0001);
     assertEquals(bigintCol.getStats().getMaxSize(), PrimitiveType.BIGINT.getSlotSize());
     assertTrue(bigintCol.getStats().hasNulls());
 
     Column floatCol = table.getColumn("float_col");
-    assertEquals(floatCol.getStats().getAvgSerializedSize() -
-        PrimitiveType.FLOAT.getSlotSize(),
+    assertEquals(floatCol.getStats().getAvgSerializedSize(),
         PrimitiveType.FLOAT.getSlotSize(), 0.0001);
     assertEquals(floatCol.getStats().getMaxSize(), PrimitiveType.FLOAT.getSlotSize());
     assertTrue(floatCol.getStats().hasNulls());
 
     Column doubleCol = table.getColumn("double_col");
-    assertEquals(doubleCol.getStats().getAvgSerializedSize() -
-        PrimitiveType.DOUBLE.getSlotSize(),
+    assertEquals(doubleCol.getStats().getAvgSerializedSize(),
         PrimitiveType.DOUBLE.getSlotSize(), 0.0001);
     assertEquals(doubleCol.getStats().getMaxSize(), PrimitiveType.DOUBLE.getSlotSize());
     assertTrue(doubleCol.getStats().hasNulls());
 
     Column timestampCol = table.getColumn("timestamp_col");
-    assertEquals(timestampCol.getStats().getAvgSerializedSize() -
-        PrimitiveType.TIMESTAMP.getSlotSize(),
+    assertEquals(timestampCol.getStats().getAvgSerializedSize(),
         PrimitiveType.TIMESTAMP.getSlotSize(), 0.0001);
     assertEquals(timestampCol.getStats().getMaxSize(),
         PrimitiveType.TIMESTAMP.getSlotSize());
-    // this does not have nulls, it's not clear why this passes
-    // TODO: investigate and re-enable
-    //assertTrue(timestampCol.getStats().hasNulls());
+    assertFalse(timestampCol.getStats().hasNulls());
 
     Column stringCol = table.getColumn("string_col");
-    assertTrue(stringCol.getStats().getAvgSerializedSize() >=
-        PrimitiveType.STRING.getSlotSize());
     assertTrue(stringCol.getStats().getAvgSerializedSize() > 0);
     assertTrue(stringCol.getStats().getMaxSize() > 0);
-    assertTrue(!stringCol.getStats().hasNulls());
+    assertFalse(stringCol.getStats().hasNulls());
   }
 
   /**
@@ -561,10 +544,7 @@ public class CatalogTest {
    * the column type results in the stats being set to "unknown". This is a regression
    * test for IMPALA-588, where this used to result in a Preconditions failure.
    */
-  // TODO: All Hive-stats related tests are temporarily disabled because of an unknown,
-  // sporadic issue causing stats of some columns to be absent in Jenkins runs.
-  // Investigate this issue further.
-  //@Test
+  @Test
   public void testColStatsColTypeMismatch() throws Exception {
     // First load a table that has column stats.
     //catalog_.refreshTable("functional", "alltypesagg", false);
@@ -597,7 +577,7 @@ public class CatalogTest {
 
       // Now try to apply a matching column stats data and ensure it succeeds.
       assertTrue(table.getColumn("string_col").updateStats(stringColStatsData));
-      assertEquals(1178, table.getColumn("string_col").getStats().getNumDistinctValues());
+      assertEquals(963, table.getColumn("string_col").getStats().getNumDistinctValues());
     }
   }
 
@@ -606,7 +586,6 @@ public class CatalogTest {
     assertEquals(-1, column.getStats().getNumNulls());
     double expectedSize = column.getType().isFixedLengthType() ?
         column.getType().getSlotSize() : -1;
-
     assertEquals(expectedSize, column.getStats().getAvgSerializedSize(), 0.0001);
     assertEquals(expectedSize, column.getStats().getMaxSize(), 0.0001);
   }
diff --git a/tests/metadata/test_explain.py b/tests/metadata/test_explain.py
index 48a6d69..9f2d61b 100644
--- a/tests/metadata/test_explain.py
+++ b/tests/metadata/test_explain.py
@@ -70,6 +70,22 @@ class TestExplain(ImpalaTestSuite):
     vector.get_value('exec_option')['explain_level'] = 3
     self.run_test_case('QueryTest/explain-level3', vector)
 
+  @staticmethod
+  def check_row_size_and_cardinality(query_result, expected_row_size=None,
+                                     expected_cardinality=None):
+    regex = re.compile('tuple-ids=.+ row-size=(\d+)B cardinality=(.*)')
+    found_match = False
+    for res in query_result:
+      m = regex.match(res.strip())
+      if m:
+        found_match = True
+        assert len(m.groups()) == 2
+        if expected_row_size:
+          assert m.groups()[0] == expected_row_size
+        if expected_cardinality:
+          assert m.groups()[1] == expected_cardinality
+    assert found_match, query_result
+
   def test_explain_validate_cardinality_estimates(self, vector, unique_database):
     # Tests that the cardinality estimates are correct for partitioned tables.
     # TODO Cardinality estimation tests should eventually be part of the planner tests.
@@ -78,13 +94,8 @@ class TestExplain(ImpalaTestSuite):
     tbl_name = 'alltypes'
 
     def check_cardinality(query_result, expected_cardinality):
-      regex = re.compile(' row-size=\d+B cardinality=(.*)$')
-      for res in query_result:
-        m = regex.match(res.strip())
-        if m:
-          assert len(m.groups()) == 1
-          # The cardinality should be zero.
-          assert m.groups()[0] == expected_cardinality
+      self.check_row_size_and_cardinality(
+          query_result, expected_cardinality=expected_cardinality)
 
     # All partitions are filtered out, cardinality should be 0.
     result = self.execute_query("explain select * from %s.%s where year = 1900" % (
@@ -130,6 +141,44 @@ class TestExplain(ImpalaTestSuite):
         query_options={'explain_level':3})
     check_cardinality(result.data, '100')
 
+  def test_explain_row_size_estimates(self, vector, unique_database):
+    """ Tests that EXPLAIN returns the expected row sizes with and without stats.
+
+    Planner tests are probably a more logical place for this, but covering string avg_size
+    handling end-to-end seemed easier here.
+
+    Note that row sizes do not include the null indicator bytes, so actual tuple sizes
+    are a bit larger. """
+    def check_row_size(query_result, expected_row_size):
+      self.check_row_size_and_cardinality(
+          query_result, expected_row_size=expected_row_size)
+
+    def execute_explain(query):
+      return self.execute_query("explain " + query, query_options={'explain_level': 3})
+
+    FQ_TBL_NAME = unique_database + ".t"
+    self.execute_query("create table %s (i int, s string)" % FQ_TBL_NAME)
+    # Fill the table with data that leads to avg_size of 4 for 's'.
+    self.execute_query("insert into %s values (1, '123'), (2, '12345')" % FQ_TBL_NAME)
+
+    # Always use slot size for fixed sized types.
+    result = execute_explain("select i from %s" % FQ_TBL_NAME)
+    check_row_size(result.data, '4')
+
+    # If there are no stats, use slot size for variable length types.
+    result = execute_explain("select s from %s" % FQ_TBL_NAME)
+    check_row_size(result.data, "12")
+
+    self.execute_query("compute stats %s" % FQ_TBL_NAME)
+
+    # Always use slot size for fixed sized types.
+    result = execute_explain("select i from %s" % FQ_TBL_NAME)
+    check_row_size(result.data, '4')
+
+    # If there are stats, use slot size + avg_size for variable length types.
+    result = execute_explain("select s from %s" % FQ_TBL_NAME)
+    check_row_size(result.data, "16")
+
 
 class TestExplainEmptyPartition(ImpalaTestSuite):
   TEST_DB_NAME = "imp_1708"