You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2019/05/03 17:06:40 UTC

[impala] 01/03: IMPALA-8409: Fix row-size for STRING columns with unknown stats

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit c2516d220da8e532b6ebdb6f3a12e7ad97c4f597
Author: Csaba Ringhofer <cs...@cloudera.com>
AuthorDate: Wed Apr 17 15:01:07 2019 +0200

    IMPALA-8409: Fix row-size for STRING columns with unknown stats
    
    Explain returned row-size=11B for STRING columns without statistics.
    The issue was caused by adding -1 (meaning unknown) to the 12 byte
    slot size (sizeof(StringValue)). The code in TupleDescriptor.java
    tried to handle this by checking if the size is -1, but it was
    already 11 at this point.
    
    There is more potential for cleanup, but I wanted to keep this
    change minimal.
    
    Testing:
    - revived some tests in CatalogTest.java that were removed
      in 2013 due to flakiness
    - added an EE test that checks row size with and without stats
    - fixed a similar test, test_explain_validate_cardinality_estimates
      (the format of the line it looks for has changed, which lead to
      skipping the actual verification and accepting everything)
    - ran core FE and EE tests
    
    Change-Id: I866acf10b2c011a735dee019f4bc29358f2ec4e5
    Reviewed-on: http://gerrit.cloudera.org:8080/13190
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 .../apache/impala/analysis/TupleDescriptor.java    |  2 +-
 .../org/apache/impala/catalog/ColumnStats.java     |  1 -
 .../org/apache/impala/planner/HdfsScanNode.java    |  3 ++
 .../org/apache/impala/catalog/CatalogTest.java     | 61 +++++++--------------
 tests/metadata/test_explain.py                     | 63 +++++++++++++++++++---
 5 files changed, 80 insertions(+), 50 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/analysis/TupleDescriptor.java b/fe/src/main/java/org/apache/impala/analysis/TupleDescriptor.java
index 87b8e5f..557874d 100644
--- a/fe/src/main/java/org/apache/impala/analysis/TupleDescriptor.java
+++ b/fe/src/main/java/org/apache/impala/analysis/TupleDescriptor.java
@@ -268,7 +268,7 @@ public class TupleDescriptor {
       ColumnStats stats = d.getStats();
       int slotSize = d.getType().getSlotSize();
 
-      if (stats.hasAvgSerializedSize()) {
+      if (stats.hasAvgSize()) {
         avgSerializedSize_ += d.getStats().getAvgSerializedSize();
       } else {
         // TODO: for computed slots, try to come up with stats estimates
diff --git a/fe/src/main/java/org/apache/impala/catalog/ColumnStats.java b/fe/src/main/java/org/apache/impala/catalog/ColumnStats.java
index a3e40cb..65533da 100644
--- a/fe/src/main/java/org/apache/impala/catalog/ColumnStats.java
+++ b/fe/src/main/java/org/apache/impala/catalog/ColumnStats.java
@@ -166,7 +166,6 @@ public class ColumnStats {
   public boolean hasNulls() { return numNulls_ > 0; }
   public long getNumNulls() { return numNulls_; }
   public boolean hasAvgSize() { return avgSize_ >= 0; }
-  public boolean hasAvgSerializedSize() { return avgSerializedSize_ >= 0; }
   public boolean hasNumDistinctValues() { return numDistinctValues_ >= 0; }
   public boolean hasStats() { return numNulls_ != -1 || numDistinctValues_ != -1; }
 
diff --git a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
index ce5c850..2a2f4e6 100644
--- a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
@@ -1694,6 +1694,9 @@ public class HdfsScanNode extends ScanNode {
     if (stats.hasAvgSize() && maxScanRangeNumRows_ != -1) {
       // Estimate the column's uncompressed data size based on row count and average
       // size.
+      // TODO: Size of strings seems to be underestimated, as avg size returns the
+      //       average length of the strings and does not include the 4 byte length
+      //       field used in Parquet plain encoding. (IMPALA-8431)
       reservationBytes =
           (long) Math.min(reservationBytes, stats.getAvgSize() * maxScanRangeNumRows_);
       if (stats.hasNumDistinctValues()) {
diff --git a/fe/src/test/java/org/apache/impala/catalog/CatalogTest.java b/fe/src/test/java/org/apache/impala/catalog/CatalogTest.java
index 80a33fc..36c9919 100644
--- a/fe/src/test/java/org/apache/impala/catalog/CatalogTest.java
+++ b/fe/src/test/java/org/apache/impala/catalog/CatalogTest.java
@@ -23,6 +23,7 @@ import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertSame;
 import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.fail;
 
 import java.io.IOException;
@@ -471,89 +472,71 @@ public class CatalogTest {
     assertEquals(1, uniqueSds.size());
   }
 
-  // TODO: All Hive-stats related tests are temporarily disabled because of an unknown,
-  // sporadic issue causing stats of some columns to be absent in Jenkins runs.
-  // Investigate this issue further.
-  //@Test
-  public void testStats() throws TableLoadingException {
+  @Test
+  public void testStats() throws CatalogException {
     // make sure the stats for functional.alltypesagg look correct
-    HdfsTable table =
-        (HdfsTable) catalog_.getDb("functional").getTable("AllTypesAgg");
+    HdfsTable table = (HdfsTable) catalog_.getOrLoadTable("functional", "AllTypesAgg");
 
     Column idCol = table.getColumn("id");
-    assertEquals(idCol.getStats().getAvgSerializedSize() -
-        PrimitiveType.INT.getSlotSize(),
+    assertEquals(idCol.getStats().getAvgSerializedSize(),
         PrimitiveType.INT.getSlotSize(), 0.0001);
     assertEquals(idCol.getStats().getMaxSize(), PrimitiveType.INT.getSlotSize());
-    assertTrue(!idCol.getStats().hasNulls());
+    assertFalse(idCol.getStats().hasNulls());
 
     Column boolCol = table.getColumn("bool_col");
-    assertEquals(boolCol.getStats().getAvgSerializedSize() -
-        PrimitiveType.BOOLEAN.getSlotSize(),
+    assertEquals(boolCol.getStats().getAvgSerializedSize(),
         PrimitiveType.BOOLEAN.getSlotSize(), 0.0001);
     assertEquals(boolCol.getStats().getMaxSize(), PrimitiveType.BOOLEAN.getSlotSize());
-    assertTrue(!boolCol.getStats().hasNulls());
+    assertFalse(boolCol.getStats().hasNulls());
 
     Column tinyintCol = table.getColumn("tinyint_col");
-    assertEquals(tinyintCol.getStats().getAvgSerializedSize() -
-        PrimitiveType.TINYINT.getSlotSize(),
+    assertEquals(tinyintCol.getStats().getAvgSerializedSize(),
         PrimitiveType.TINYINT.getSlotSize(), 0.0001);
-    assertEquals(tinyintCol.getStats().getMaxSize(),
-        PrimitiveType.TINYINT.getSlotSize());
+    assertEquals(tinyintCol.getStats().getMaxSize(), PrimitiveType.TINYINT.getSlotSize());
     assertTrue(tinyintCol.getStats().hasNulls());
 
     Column smallintCol = table.getColumn("smallint_col");
-    assertEquals(smallintCol.getStats().getAvgSerializedSize() -
-        PrimitiveType.SMALLINT.getSlotSize(),
+    assertEquals(smallintCol.getStats().getAvgSerializedSize(),
         PrimitiveType.SMALLINT.getSlotSize(), 0.0001);
     assertEquals(smallintCol.getStats().getMaxSize(),
         PrimitiveType.SMALLINT.getSlotSize());
     assertTrue(smallintCol.getStats().hasNulls());
 
     Column intCol = table.getColumn("int_col");
-    assertEquals(intCol.getStats().getAvgSerializedSize() -
-        PrimitiveType.INT.getSlotSize(),
+    assertEquals(intCol.getStats().getAvgSerializedSize(),
         PrimitiveType.INT.getSlotSize(), 0.0001);
     assertEquals(intCol.getStats().getMaxSize(), PrimitiveType.INT.getSlotSize());
     assertTrue(intCol.getStats().hasNulls());
 
     Column bigintCol = table.getColumn("bigint_col");
-    assertEquals(bigintCol.getStats().getAvgSerializedSize() -
-        PrimitiveType.BIGINT.getSlotSize(),
+    assertEquals(bigintCol.getStats().getAvgSerializedSize(),
         PrimitiveType.BIGINT.getSlotSize(), 0.0001);
     assertEquals(bigintCol.getStats().getMaxSize(), PrimitiveType.BIGINT.getSlotSize());
     assertTrue(bigintCol.getStats().hasNulls());
 
     Column floatCol = table.getColumn("float_col");
-    assertEquals(floatCol.getStats().getAvgSerializedSize() -
-        PrimitiveType.FLOAT.getSlotSize(),
+    assertEquals(floatCol.getStats().getAvgSerializedSize(),
         PrimitiveType.FLOAT.getSlotSize(), 0.0001);
     assertEquals(floatCol.getStats().getMaxSize(), PrimitiveType.FLOAT.getSlotSize());
     assertTrue(floatCol.getStats().hasNulls());
 
     Column doubleCol = table.getColumn("double_col");
-    assertEquals(doubleCol.getStats().getAvgSerializedSize() -
-        PrimitiveType.DOUBLE.getSlotSize(),
+    assertEquals(doubleCol.getStats().getAvgSerializedSize(),
         PrimitiveType.DOUBLE.getSlotSize(), 0.0001);
     assertEquals(doubleCol.getStats().getMaxSize(), PrimitiveType.DOUBLE.getSlotSize());
     assertTrue(doubleCol.getStats().hasNulls());
 
     Column timestampCol = table.getColumn("timestamp_col");
-    assertEquals(timestampCol.getStats().getAvgSerializedSize() -
-        PrimitiveType.TIMESTAMP.getSlotSize(),
+    assertEquals(timestampCol.getStats().getAvgSerializedSize(),
         PrimitiveType.TIMESTAMP.getSlotSize(), 0.0001);
     assertEquals(timestampCol.getStats().getMaxSize(),
         PrimitiveType.TIMESTAMP.getSlotSize());
-    // this does not have nulls, it's not clear why this passes
-    // TODO: investigate and re-enable
-    //assertTrue(timestampCol.getStats().hasNulls());
+    assertFalse(timestampCol.getStats().hasNulls());
 
     Column stringCol = table.getColumn("string_col");
-    assertTrue(stringCol.getStats().getAvgSerializedSize() >=
-        PrimitiveType.STRING.getSlotSize());
     assertTrue(stringCol.getStats().getAvgSerializedSize() > 0);
     assertTrue(stringCol.getStats().getMaxSize() > 0);
-    assertTrue(!stringCol.getStats().hasNulls());
+    assertFalse(stringCol.getStats().hasNulls());
   }
 
   /**
@@ -561,10 +544,7 @@ public class CatalogTest {
    * the column type results in the stats being set to "unknown". This is a regression
    * test for IMPALA-588, where this used to result in a Preconditions failure.
    */
-  // TODO: All Hive-stats related tests are temporarily disabled because of an unknown,
-  // sporadic issue causing stats of some columns to be absent in Jenkins runs.
-  // Investigate this issue further.
-  //@Test
+  @Test
   public void testColStatsColTypeMismatch() throws Exception {
     // First load a table that has column stats.
     //catalog_.refreshTable("functional", "alltypesagg", false);
@@ -597,7 +577,7 @@ public class CatalogTest {
 
       // Now try to apply a matching column stats data and ensure it succeeds.
       assertTrue(table.getColumn("string_col").updateStats(stringColStatsData));
-      assertEquals(1178, table.getColumn("string_col").getStats().getNumDistinctValues());
+      assertEquals(963, table.getColumn("string_col").getStats().getNumDistinctValues());
     }
   }
 
@@ -606,7 +586,6 @@ public class CatalogTest {
     assertEquals(-1, column.getStats().getNumNulls());
     double expectedSize = column.getType().isFixedLengthType() ?
         column.getType().getSlotSize() : -1;
-
     assertEquals(expectedSize, column.getStats().getAvgSerializedSize(), 0.0001);
     assertEquals(expectedSize, column.getStats().getMaxSize(), 0.0001);
   }
diff --git a/tests/metadata/test_explain.py b/tests/metadata/test_explain.py
index 48a6d69..9f2d61b 100644
--- a/tests/metadata/test_explain.py
+++ b/tests/metadata/test_explain.py
@@ -70,6 +70,22 @@ class TestExplain(ImpalaTestSuite):
     vector.get_value('exec_option')['explain_level'] = 3
     self.run_test_case('QueryTest/explain-level3', vector)
 
+  @staticmethod
+  def check_row_size_and_cardinality(query_result, expected_row_size=None,
+                                     expected_cardinality=None):
+    regex = re.compile('tuple-ids=.+ row-size=(\d+)B cardinality=(.*)')
+    found_match = False
+    for res in query_result:
+      m = regex.match(res.strip())
+      if m:
+        found_match = True
+        assert len(m.groups()) == 2
+        if expected_row_size:
+          assert m.groups()[0] == expected_row_size
+        if expected_cardinality:
+          assert m.groups()[1] == expected_cardinality
+    assert found_match, query_result
+
   def test_explain_validate_cardinality_estimates(self, vector, unique_database):
     # Tests that the cardinality estimates are correct for partitioned tables.
     # TODO Cardinality estimation tests should eventually be part of the planner tests.
@@ -78,13 +94,8 @@ class TestExplain(ImpalaTestSuite):
     tbl_name = 'alltypes'
 
     def check_cardinality(query_result, expected_cardinality):
-      regex = re.compile(' row-size=\d+B cardinality=(.*)$')
-      for res in query_result:
-        m = regex.match(res.strip())
-        if m:
-          assert len(m.groups()) == 1
-          # The cardinality should be zero.
-          assert m.groups()[0] == expected_cardinality
+      self.check_row_size_and_cardinality(
+          query_result, expected_cardinality=expected_cardinality)
 
     # All partitions are filtered out, cardinality should be 0.
     result = self.execute_query("explain select * from %s.%s where year = 1900" % (
@@ -130,6 +141,44 @@ class TestExplain(ImpalaTestSuite):
         query_options={'explain_level':3})
     check_cardinality(result.data, '100')
 
+  def test_explain_row_size_estimates(self, vector, unique_database):
+    """ Tests that EXPLAIN returns the expected row sizes with and without stats.
+
+    Planner tests is probably a more logical place for this, but covering string avg_size
+    handling end-to-end seemed easier here.
+
+    Note that row sizes do not include the null indicator bytes, so actual tuple sizes
+    are a bit larger. """
+    def check_row_size(query_result, expected_row_size):
+      self.check_row_size_and_cardinality(
+          query_result, expected_row_size=expected_row_size)
+
+    def execute_explain(query):
+      return self.execute_query("explain " + query, query_options={'explain_level': 3})
+
+    FQ_TBL_NAME = unique_database + ".t"
+    self.execute_query("create table %s (i int, s string)" % FQ_TBL_NAME)
+    # Fill the table with data that leads to avg_size of 4 for 's'.
+    self.execute_query("insert into %s values (1, '123'), (2, '12345')" % FQ_TBL_NAME)
+
+    # Always use slot size for fixed sized types.
+    result = execute_explain("select i from %s" % FQ_TBL_NAME)
+    check_row_size(result.data, '4')
+
+    # If there are no stats, use slot size for variable length types.
+    result = execute_explain("select s from %s" % FQ_TBL_NAME)
+    check_row_size(result.data, "12")
+
+    self.execute_query("compute stats %s" % FQ_TBL_NAME)
+
+    # Always use slot size for fixed sized types.
+    result = execute_explain("select i from %s" % FQ_TBL_NAME)
+    check_row_size(result.data, '4')
+
+    # If there are no stats, use slot size  + avg_size for variable length types.
+    result = execute_explain("select s from %s" % FQ_TBL_NAME)
+    check_row_size(result.data, "16")
+
 
 class TestExplainEmptyPartition(ImpalaTestSuite):
   TEST_DB_NAME = "imp_1708"