You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2019/05/03 17:06:40 UTC
[impala] 01/03: IMPALA-8409: Fix row-size for STRING columns with
unknown stats
This is an automated email from the ASF dual-hosted git repository.
joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit c2516d220da8e532b6ebdb6f3a12e7ad97c4f597
Author: Csaba Ringhofer <cs...@cloudera.com>
AuthorDate: Wed Apr 17 15:01:07 2019 +0200
IMPALA-8409: Fix row-size for STRING columns with unknown stats
Explain returned row-size=11B for STRING columns without statistics.
The issue was caused by adding -1 (meaning unknown) to the 12 byte
slot size (sizeof(StringValue)). The code in TupleDescriptor.java
tried to handle this by checking if the size is -1, but it was
already 11 at this point.
There is more potential for cleanup, but I wanted to keep this
change minimal.
Testing:
- revived some tests in CatalogTest.java that were removed
in 2013 due to flakiness
- added an EE test that checks row size with and without stats
- fixed a similar test, test_explain_validate_cardinality_estimates
(the format of the line it looks for has changed, which lead to
skipping the actual verification and accepting everything)
- ran core FE and EE tests
Change-Id: I866acf10b2c011a735dee019f4bc29358f2ec4e5
Reviewed-on: http://gerrit.cloudera.org:8080/13190
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
.../apache/impala/analysis/TupleDescriptor.java | 2 +-
.../org/apache/impala/catalog/ColumnStats.java | 1 -
.../org/apache/impala/planner/HdfsScanNode.java | 3 ++
.../org/apache/impala/catalog/CatalogTest.java | 61 +++++++--------------
tests/metadata/test_explain.py | 63 +++++++++++++++++++---
5 files changed, 80 insertions(+), 50 deletions(-)
diff --git a/fe/src/main/java/org/apache/impala/analysis/TupleDescriptor.java b/fe/src/main/java/org/apache/impala/analysis/TupleDescriptor.java
index 87b8e5f..557874d 100644
--- a/fe/src/main/java/org/apache/impala/analysis/TupleDescriptor.java
+++ b/fe/src/main/java/org/apache/impala/analysis/TupleDescriptor.java
@@ -268,7 +268,7 @@ public class TupleDescriptor {
ColumnStats stats = d.getStats();
int slotSize = d.getType().getSlotSize();
- if (stats.hasAvgSerializedSize()) {
+ if (stats.hasAvgSize()) {
avgSerializedSize_ += d.getStats().getAvgSerializedSize();
} else {
// TODO: for computed slots, try to come up with stats estimates
diff --git a/fe/src/main/java/org/apache/impala/catalog/ColumnStats.java b/fe/src/main/java/org/apache/impala/catalog/ColumnStats.java
index a3e40cb..65533da 100644
--- a/fe/src/main/java/org/apache/impala/catalog/ColumnStats.java
+++ b/fe/src/main/java/org/apache/impala/catalog/ColumnStats.java
@@ -166,7 +166,6 @@ public class ColumnStats {
public boolean hasNulls() { return numNulls_ > 0; }
public long getNumNulls() { return numNulls_; }
public boolean hasAvgSize() { return avgSize_ >= 0; }
- public boolean hasAvgSerializedSize() { return avgSerializedSize_ >= 0; }
public boolean hasNumDistinctValues() { return numDistinctValues_ >= 0; }
public boolean hasStats() { return numNulls_ != -1 || numDistinctValues_ != -1; }
diff --git a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
index ce5c850..2a2f4e6 100644
--- a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
@@ -1694,6 +1694,9 @@ public class HdfsScanNode extends ScanNode {
if (stats.hasAvgSize() && maxScanRangeNumRows_ != -1) {
// Estimate the column's uncompressed data size based on row count and average
// size.
+ // TODO: Size of strings seems to be underestimated, as avg size returns the
+ // average length of the strings and does not include the 4 byte length
+ // field used in Parquet plain encoding. (IMPALA-8431)
reservationBytes =
(long) Math.min(reservationBytes, stats.getAvgSize() * maxScanRangeNumRows_);
if (stats.hasNumDistinctValues()) {
diff --git a/fe/src/test/java/org/apache/impala/catalog/CatalogTest.java b/fe/src/test/java/org/apache/impala/catalog/CatalogTest.java
index 80a33fc..36c9919 100644
--- a/fe/src/test/java/org/apache/impala/catalog/CatalogTest.java
+++ b/fe/src/test/java/org/apache/impala/catalog/CatalogTest.java
@@ -23,6 +23,7 @@ import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertSame;
import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.assertFalse;
import static org.junit.Assert.fail;
import java.io.IOException;
@@ -471,89 +472,71 @@ public class CatalogTest {
assertEquals(1, uniqueSds.size());
}
- // TODO: All Hive-stats related tests are temporarily disabled because of an unknown,
- // sporadic issue causing stats of some columns to be absent in Jenkins runs.
- // Investigate this issue further.
- //@Test
- public void testStats() throws TableLoadingException {
+ @Test
+ public void testStats() throws CatalogException {
// make sure the stats for functional.alltypesagg look correct
- HdfsTable table =
- (HdfsTable) catalog_.getDb("functional").getTable("AllTypesAgg");
+ HdfsTable table = (HdfsTable) catalog_.getOrLoadTable("functional", "AllTypesAgg");
Column idCol = table.getColumn("id");
- assertEquals(idCol.getStats().getAvgSerializedSize() -
- PrimitiveType.INT.getSlotSize(),
+ assertEquals(idCol.getStats().getAvgSerializedSize(),
PrimitiveType.INT.getSlotSize(), 0.0001);
assertEquals(idCol.getStats().getMaxSize(), PrimitiveType.INT.getSlotSize());
- assertTrue(!idCol.getStats().hasNulls());
+ assertFalse(idCol.getStats().hasNulls());
Column boolCol = table.getColumn("bool_col");
- assertEquals(boolCol.getStats().getAvgSerializedSize() -
- PrimitiveType.BOOLEAN.getSlotSize(),
+ assertEquals(boolCol.getStats().getAvgSerializedSize(),
PrimitiveType.BOOLEAN.getSlotSize(), 0.0001);
assertEquals(boolCol.getStats().getMaxSize(), PrimitiveType.BOOLEAN.getSlotSize());
- assertTrue(!boolCol.getStats().hasNulls());
+ assertFalse(boolCol.getStats().hasNulls());
Column tinyintCol = table.getColumn("tinyint_col");
- assertEquals(tinyintCol.getStats().getAvgSerializedSize() -
- PrimitiveType.TINYINT.getSlotSize(),
+ assertEquals(tinyintCol.getStats().getAvgSerializedSize(),
PrimitiveType.TINYINT.getSlotSize(), 0.0001);
- assertEquals(tinyintCol.getStats().getMaxSize(),
- PrimitiveType.TINYINT.getSlotSize());
+ assertEquals(tinyintCol.getStats().getMaxSize(), PrimitiveType.TINYINT.getSlotSize());
assertTrue(tinyintCol.getStats().hasNulls());
Column smallintCol = table.getColumn("smallint_col");
- assertEquals(smallintCol.getStats().getAvgSerializedSize() -
- PrimitiveType.SMALLINT.getSlotSize(),
+ assertEquals(smallintCol.getStats().getAvgSerializedSize(),
PrimitiveType.SMALLINT.getSlotSize(), 0.0001);
assertEquals(smallintCol.getStats().getMaxSize(),
PrimitiveType.SMALLINT.getSlotSize());
assertTrue(smallintCol.getStats().hasNulls());
Column intCol = table.getColumn("int_col");
- assertEquals(intCol.getStats().getAvgSerializedSize() -
- PrimitiveType.INT.getSlotSize(),
+ assertEquals(intCol.getStats().getAvgSerializedSize(),
PrimitiveType.INT.getSlotSize(), 0.0001);
assertEquals(intCol.getStats().getMaxSize(), PrimitiveType.INT.getSlotSize());
assertTrue(intCol.getStats().hasNulls());
Column bigintCol = table.getColumn("bigint_col");
- assertEquals(bigintCol.getStats().getAvgSerializedSize() -
- PrimitiveType.BIGINT.getSlotSize(),
+ assertEquals(bigintCol.getStats().getAvgSerializedSize(),
PrimitiveType.BIGINT.getSlotSize(), 0.0001);
assertEquals(bigintCol.getStats().getMaxSize(), PrimitiveType.BIGINT.getSlotSize());
assertTrue(bigintCol.getStats().hasNulls());
Column floatCol = table.getColumn("float_col");
- assertEquals(floatCol.getStats().getAvgSerializedSize() -
- PrimitiveType.FLOAT.getSlotSize(),
+ assertEquals(floatCol.getStats().getAvgSerializedSize(),
PrimitiveType.FLOAT.getSlotSize(), 0.0001);
assertEquals(floatCol.getStats().getMaxSize(), PrimitiveType.FLOAT.getSlotSize());
assertTrue(floatCol.getStats().hasNulls());
Column doubleCol = table.getColumn("double_col");
- assertEquals(doubleCol.getStats().getAvgSerializedSize() -
- PrimitiveType.DOUBLE.getSlotSize(),
+ assertEquals(doubleCol.getStats().getAvgSerializedSize(),
PrimitiveType.DOUBLE.getSlotSize(), 0.0001);
assertEquals(doubleCol.getStats().getMaxSize(), PrimitiveType.DOUBLE.getSlotSize());
assertTrue(doubleCol.getStats().hasNulls());
Column timestampCol = table.getColumn("timestamp_col");
- assertEquals(timestampCol.getStats().getAvgSerializedSize() -
- PrimitiveType.TIMESTAMP.getSlotSize(),
+ assertEquals(timestampCol.getStats().getAvgSerializedSize(),
PrimitiveType.TIMESTAMP.getSlotSize(), 0.0001);
assertEquals(timestampCol.getStats().getMaxSize(),
PrimitiveType.TIMESTAMP.getSlotSize());
- // this does not have nulls, it's not clear why this passes
- // TODO: investigate and re-enable
- //assertTrue(timestampCol.getStats().hasNulls());
+ assertFalse(timestampCol.getStats().hasNulls());
Column stringCol = table.getColumn("string_col");
- assertTrue(stringCol.getStats().getAvgSerializedSize() >=
- PrimitiveType.STRING.getSlotSize());
assertTrue(stringCol.getStats().getAvgSerializedSize() > 0);
assertTrue(stringCol.getStats().getMaxSize() > 0);
- assertTrue(!stringCol.getStats().hasNulls());
+ assertFalse(stringCol.getStats().hasNulls());
}
/**
@@ -561,10 +544,7 @@ public class CatalogTest {
* the column type results in the stats being set to "unknown". This is a regression
* test for IMPALA-588, where this used to result in a Preconditions failure.
*/
- // TODO: All Hive-stats related tests are temporarily disabled because of an unknown,
- // sporadic issue causing stats of some columns to be absent in Jenkins runs.
- // Investigate this issue further.
- //@Test
+ @Test
public void testColStatsColTypeMismatch() throws Exception {
// First load a table that has column stats.
//catalog_.refreshTable("functional", "alltypesagg", false);
@@ -597,7 +577,7 @@ public class CatalogTest {
// Now try to apply a matching column stats data and ensure it succeeds.
assertTrue(table.getColumn("string_col").updateStats(stringColStatsData));
- assertEquals(1178, table.getColumn("string_col").getStats().getNumDistinctValues());
+ assertEquals(963, table.getColumn("string_col").getStats().getNumDistinctValues());
}
}
@@ -606,7 +586,6 @@ public class CatalogTest {
assertEquals(-1, column.getStats().getNumNulls());
double expectedSize = column.getType().isFixedLengthType() ?
column.getType().getSlotSize() : -1;
-
assertEquals(expectedSize, column.getStats().getAvgSerializedSize(), 0.0001);
assertEquals(expectedSize, column.getStats().getMaxSize(), 0.0001);
}
diff --git a/tests/metadata/test_explain.py b/tests/metadata/test_explain.py
index 48a6d69..9f2d61b 100644
--- a/tests/metadata/test_explain.py
+++ b/tests/metadata/test_explain.py
@@ -70,6 +70,22 @@ class TestExplain(ImpalaTestSuite):
vector.get_value('exec_option')['explain_level'] = 3
self.run_test_case('QueryTest/explain-level3', vector)
+ @staticmethod
+ def check_row_size_and_cardinality(query_result, expected_row_size=None,
+ expected_cardinality=None):
+ regex = re.compile('tuple-ids=.+ row-size=(\d+)B cardinality=(.*)')
+ found_match = False
+ for res in query_result:
+ m = regex.match(res.strip())
+ if m:
+ found_match = True
+ assert len(m.groups()) == 2
+ if expected_row_size:
+ assert m.groups()[0] == expected_row_size
+ if expected_cardinality:
+ assert m.groups()[1] == expected_cardinality
+ assert found_match, query_result
+
def test_explain_validate_cardinality_estimates(self, vector, unique_database):
# Tests that the cardinality estimates are correct for partitioned tables.
# TODO Cardinality estimation tests should eventually be part of the planner tests.
@@ -78,13 +94,8 @@ class TestExplain(ImpalaTestSuite):
tbl_name = 'alltypes'
def check_cardinality(query_result, expected_cardinality):
- regex = re.compile(' row-size=\d+B cardinality=(.*)$')
- for res in query_result:
- m = regex.match(res.strip())
- if m:
- assert len(m.groups()) == 1
- # The cardinality should be zero.
- assert m.groups()[0] == expected_cardinality
+ self.check_row_size_and_cardinality(
+ query_result, expected_cardinality=expected_cardinality)
# All partitions are filtered out, cardinality should be 0.
result = self.execute_query("explain select * from %s.%s where year = 1900" % (
@@ -130,6 +141,44 @@ class TestExplain(ImpalaTestSuite):
query_options={'explain_level':3})
check_cardinality(result.data, '100')
+ def test_explain_row_size_estimates(self, vector, unique_database):
+ """ Tests that EXPLAIN returns the expected row sizes with and without stats.
+
+ Planner tests is probably a more logical place for this, but covering string avg_size
+ handling end-to-end seemed easier here.
+
+ Note that row sizes do not include the null indicator bytes, so actual tuple sizes
+ are a bit larger. """
+ def check_row_size(query_result, expected_row_size):
+ self.check_row_size_and_cardinality(
+ query_result, expected_row_size=expected_row_size)
+
+ def execute_explain(query):
+ return self.execute_query("explain " + query, query_options={'explain_level': 3})
+
+ FQ_TBL_NAME = unique_database + ".t"
+ self.execute_query("create table %s (i int, s string)" % FQ_TBL_NAME)
+ # Fill the table with data that leads to avg_size of 4 for 's'.
+ self.execute_query("insert into %s values (1, '123'), (2, '12345')" % FQ_TBL_NAME)
+
+ # Always use slot size for fixed sized types.
+ result = execute_explain("select i from %s" % FQ_TBL_NAME)
+ check_row_size(result.data, '4')
+
+ # If there are no stats, use slot size for variable length types.
+ result = execute_explain("select s from %s" % FQ_TBL_NAME)
+ check_row_size(result.data, "12")
+
+ self.execute_query("compute stats %s" % FQ_TBL_NAME)
+
+ # Always use slot size for fixed sized types.
+ result = execute_explain("select i from %s" % FQ_TBL_NAME)
+ check_row_size(result.data, '4')
+
+ # If there are no stats, use slot size + avg_size for variable length types.
+ result = execute_explain("select s from %s" % FQ_TBL_NAME)
+ check_row_size(result.data, "16")
+
class TestExplainEmptyPartition(ImpalaTestSuite):
TEST_DB_NAME = "imp_1708"