You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kylin.apache.org by xx...@apache.org on 2020/12/18 08:57:32 UTC
[kylin] branch kylin-on-parquet-v2 updated: KYLIN-4845 Fix NFilePruningTest report dup key error
This is an automated email from the ASF dual-hosted git repository.
xxyu pushed a commit to branch kylin-on-parquet-v2
in repository https://gitbox.apache.org/repos/asf/kylin.git
The following commit(s) were added to refs/heads/kylin-on-parquet-v2 by this push:
new 49dcdd2 KYLIN-4845 Fix NFilePruningTest report dup key error
49dcdd2 is described below
commit 49dcdd270a0a014123e4ba1b586c49e6dd508540
Author: yaqian.zhang <59...@qq.com>
AuthorDate: Fri Dec 18 15:31:49 2020 +0800
KYLIN-4845 Fix NFilePruningTest report dup key error
---
.../file_prunning/cube_desc/file_pruning_cube.json | 61 +++++++++-------------
.../cube_desc/file_pruning_cube2.json | 2 +-
.../model_desc/file_pruning_model.json | 22 ++++----
.../model_desc/file_pruning_model2.json | 40 --------------
.../file_prunning/project/default.json | 2 +-
.../engine/spark/builder/CubeSnapshotBuilder.scala | 2 +-
.../spark2/file_pruning/NFilePruningTest.java | 49 +++++++++--------
7 files changed, 62 insertions(+), 116 deletions(-)
diff --git a/examples/test_case_data/file_prunning/cube_desc/file_pruning_cube.json b/examples/test_case_data/file_prunning/cube_desc/file_pruning_cube.json
index 01581d9..0ec4904 100644
--- a/examples/test_case_data/file_prunning/cube_desc/file_pruning_cube.json
+++ b/examples/test_case_data/file_prunning/cube_desc/file_pruning_cube.json
@@ -1,6 +1,6 @@
{
"uuid" : "330b1839-1baf-5e4f-7f4c-ad173a5217c8",
- "last_modified" : 1589194835241,
+ "last_modified" : 1589268257682,
"version" : "3.9.9.1",
"name" : "file_pruning_cube",
"is_draft" : false,
@@ -9,29 +9,14 @@
"null_string" : null,
"dimensions" : [ {
"name" : "ORDER_ID",
- "table" : "TEST_ORDER",
+ "table" : "TEST_KYLIN_FACT",
"column" : "ORDER_ID",
"derived" : null
}, {
- "name" : "BUYER_ID",
- "table" : "TEST_ORDER",
- "column" : "BUYER_ID",
- "derived" : null
- }, {
- "name" : "TEST_DATE_ENC",
- "table" : "TEST_ORDER",
- "column" : "TEST_DATE_ENC",
- "derived" : null
- }, {
- "name" : "TEST_TIME_ENC",
- "table" : "TEST_ORDER",
- "column" : "TEST_TIME_ENC",
- "derived" : null
- }, {
- "name" : "ORDER_ID",
+ "name" : "CAL_DT",
"table" : "TEST_KYLIN_FACT",
- "column" : null,
- "derived" : [ "ORDER_ID" ]
+ "column" : "CAL_DT",
+ "derived" : null
}, {
"name" : "LSTG_FORMAT_NAME",
"table" : "TEST_KYLIN_FACT",
@@ -52,6 +37,16 @@
"table" : "TEST_KYLIN_FACT",
"column" : "PRICE",
"derived" : null
+ }, {
+ "name" : "ORDER_ID",
+ "table" : "TEST_ORDER",
+ "column" : null,
+ "derived" : [ "ORDER_ID" ]
+ }, {
+ "name" : "TEST_DATE_ENC",
+ "table" : "TEST_ORDER",
+ "column" : null,
+ "derived" : [ "TEST_DATE_ENC" ]
} ],
"measures" : [ {
"name" : "_COUNT_",
@@ -67,20 +62,20 @@
"dictionaries" : [ ],
"rowkey" : {
"rowkey_columns" : [ {
- "column" : "TEST_ORDER.ORDER_ID",
+ "column" : "TEST_KYLIN_FACT.ORDER_ID",
"encoding" : "dict",
"isShardBy" : false
}, {
- "column" : "TEST_ORDER.BUYER_ID",
- "encoding" : "dict",
+ "column" : "TEST_KYLIN_FACT.CAL_DT",
+ "encoding" : "date",
"isShardBy" : false
}, {
- "column" : "TEST_ORDER.TEST_DATE_ENC",
- "encoding" : "date",
+ "column" : "TEST_KYLIN_FACT.LSTG_FORMAT_NAME",
+ "encoding" : "dict",
"isShardBy" : false
}, {
- "column" : "TEST_ORDER.TEST_TIME_ENC",
- "encoding" : "time",
+ "column" : "TEST_KYLIN_FACT.SLR_SEGMENT_CD",
+ "encoding" : "dict",
"isShardBy" : false
}, {
"column" : "TEST_KYLIN_FACT.SELLER_ID",
@@ -90,14 +85,6 @@
"column" : "TEST_KYLIN_FACT.PRICE",
"encoding" : "dict",
"isShardBy" : false
- }, {
- "column" : "TEST_KYLIN_FACT.SLR_SEGMENT_CD",
- "encoding" : "dict",
- "isShardBy" : false
- } , {
- "column" : "TEST_KYLIN_FACT.LSTG_FORMAT_NAME",
- "encoding" : "dict",
- "isShardBy" : false
} ]
},
"hbase_mapping" : {
@@ -110,14 +97,14 @@
} ]
},
"aggregation_groups" : [ {
- "includes" : [ "TEST_ORDER.ORDER_ID", "TEST_ORDER.BUYER_ID", "TEST_ORDER.TEST_DATE_ENC", "TEST_ORDER.TEST_TIME_ENC" ],
+ "includes" : [ "TEST_KYLIN_FACT.ORDER_ID", "TEST_KYLIN_FACT.CAL_DT", "TEST_KYLIN_FACT.LSTG_FORMAT_NAME", "TEST_KYLIN_FACT.SLR_SEGMENT_CD", "TEST_KYLIN_FACT.SELLER_ID", "TEST_KYLIN_FACT.PRICE" ],
"select_rule" : {
"hierarchy_dims" : [ ],
"mandatory_dims" : [ ],
"joint_dims" : [ ]
}
} ],
- "signature" : "2BsFyXxmB2F+FNlH4RxDwQ==",
+ "signature" : "ioyeFiil5XMOa8G02uVhgQ==",
"notify_list" : [ ],
"status_need_notify" : [ "ERROR", "DISCARDED", "SUCCEED" ],
"partition_date_start" : 0,
diff --git a/examples/test_case_data/file_prunning/cube_desc/file_pruning_cube2.json b/examples/test_case_data/file_prunning/cube_desc/file_pruning_cube2.json
index f9fa85b..8e2bd3a 100644
--- a/examples/test_case_data/file_prunning/cube_desc/file_pruning_cube2.json
+++ b/examples/test_case_data/file_prunning/cube_desc/file_pruning_cube2.json
@@ -4,7 +4,7 @@
"version" : "3.9.9.1",
"name" : "file_pruning_cube2",
"is_draft" : false,
- "model_name" : "file_pruning_model2",
+ "model_name" : "file_pruning_model",
"description" : "",
"null_string" : null,
"dimensions" : [ {
diff --git a/examples/test_case_data/file_prunning/model_desc/file_pruning_model.json b/examples/test_case_data/file_prunning/model_desc/file_pruning_model.json
index fb35bd8..018a384 100644
--- a/examples/test_case_data/file_prunning/model_desc/file_pruning_model.json
+++ b/examples/test_case_data/file_prunning/model_desc/file_pruning_model.json
@@ -1,33 +1,33 @@
{
"uuid" : "1433c51f-fa34-3c12-6d10-d4a59338a19d",
- "last_modified" : 1589194803840,
+ "last_modified" : 0,
"version" : "3.9.9.1",
"name" : "file_pruning_model",
"owner" : "ADMIN",
"is_draft" : false,
"description" : "",
- "fact_table" : "DEFAULT.TEST_ORDER",
+ "fact_table" : "DEFAULT.TEST_KYLIN_FACT",
"lookups" : [ {
- "table" : "DEFAULT.TEST_KYLIN_FACT",
+ "table" : "DEFAULT.TEST_ORDER",
"kind" : "LOOKUP",
- "alias" : "TEST_KYLIN_FACT",
+ "alias" : "TEST_ORDER",
"join" : {
"type" : "left",
- "primary_key" : [ "TEST_KYLIN_FACT.ORDER_ID" ],
- "foreign_key" : [ "TEST_ORDER.ORDER_ID" ]
+ "primary_key" : [ "TEST_ORDER.ORDER_ID" ],
+ "foreign_key" : [ "TEST_KYLIN_FACT.ORDER_ID" ]
}
} ],
"dimensions" : [ {
- "table" : "TEST_ORDER",
- "columns" : [ "ORDER_ID", "BUYER_ID", "TEST_DATE_ENC", "TEST_TIME_ENC" ]
- }, {
"table" : "TEST_KYLIN_FACT",
- "columns" : [ "SELLER_ID", "PRICE", "SLR_SEGMENT_CD", "LSTG_FORMAT_NAME", "ORDER_ID" ]
+ "columns" : [ "SELLER_ID", "PRICE", "SLR_SEGMENT_CD", "LSTG_FORMAT_NAME", "CAL_DT", "ORDER_ID" ]
+ }, {
+ "table" : "TEST_ORDER",
+ "columns" : [ "ORDER_ID", "TEST_DATE_ENC" ]
} ],
"metrics" : [ ],
"filter_condition" : "",
"partition_desc" : {
- "partition_date_column" : "TEST_ORDER.TEST_TIME_ENC",
+ "partition_date_column" : "TEST_KYLIN_FACT.CAL_DT",
"partition_time_column" : null,
"partition_date_start" : 0,
"partition_date_format" : "yyyy-MM-dd",
diff --git a/examples/test_case_data/file_prunning/model_desc/file_pruning_model2.json b/examples/test_case_data/file_prunning/model_desc/file_pruning_model2.json
deleted file mode 100644
index 2f29fc6..0000000
--- a/examples/test_case_data/file_prunning/model_desc/file_pruning_model2.json
+++ /dev/null
@@ -1,40 +0,0 @@
-{
- "uuid" : "7622b8c5-a5fb-e10b-4038-4cbccfc2df24",
- "last_modified" : 1589268019470,
- "version" : "3.9.9.1",
- "name" : "file_pruning_model2",
- "owner" : "ADMIN",
- "is_draft" : false,
- "description" : "",
- "fact_table" : "DEFAULT.TEST_KYLIN_FACT",
- "lookups" : [ {
- "table" : "DEFAULT.TEST_ORDER",
- "kind" : "LOOKUP",
- "alias" : "TEST_ORDER",
- "join" : {
- "type" : "left",
- "primary_key" : [ "TEST_ORDER.ORDER_ID" ],
- "foreign_key" : [ "TEST_KYLIN_FACT.ORDER_ID" ]
- }
- } ],
- "dimensions" : [ {
- "table" : "TEST_KYLIN_FACT",
- "columns" : [ "SELLER_ID", "PRICE", "SLR_SEGMENT_CD", "LSTG_FORMAT_NAME", "CAL_DT", "ORDER_ID" ]
- }, {
- "table" : "TEST_ORDER",
- "columns" : [ "ORDER_ID", "TEST_DATE_ENC" ]
- } ],
- "metrics" : [ ],
- "filter_condition" : "",
- "partition_desc" : {
- "partition_date_column" : "TEST_KYLIN_FACT.CAL_DT",
- "partition_time_column" : null,
- "partition_date_start" : 0,
- "partition_date_format" : "yyyy-MM-dd",
- "partition_time_format" : "HH:mm:ss",
- "partition_type" : "APPEND",
- "partition_condition_builder" : "org.apache.kylin.metadata.model.PartitionDesc$DefaultPartitionConditionBuilder"
- },
- "capacity" : "MEDIUM",
- "projectName" : "default"
-}
\ No newline at end of file
diff --git a/examples/test_case_data/file_prunning/project/default.json b/examples/test_case_data/file_prunning/project/default.json
index 9ebc29f..e983b5d 100644
--- a/examples/test_case_data/file_prunning/project/default.json
+++ b/examples/test_case_data/file_prunning/project/default.json
@@ -19,7 +19,7 @@
"type" : "CUBE",
"realization" : "file_pruning_cube_measure"
} ],
- "models" : [ "file_pruning_model","file_pruning_model2","file_pruning_model_measure" ],
+ "models" : [ "file_pruning_model","file_pruning_model_measure" ],
"ext_filters" : [ ],
"override_kylin_properties" : { }
}
\ No newline at end of file
diff --git a/kylin-spark-project/kylin-spark-engine/src/main/scala/org/apache/kylin/engine/spark/builder/CubeSnapshotBuilder.scala b/kylin-spark-project/kylin-spark-engine/src/main/scala/org/apache/kylin/engine/spark/builder/CubeSnapshotBuilder.scala
index d1b62f0..146dcab 100644
--- a/kylin-spark-project/kylin-spark-engine/src/main/scala/org/apache/kylin/engine/spark/builder/CubeSnapshotBuilder.scala
+++ b/kylin-spark-project/kylin-spark-engine/src/main/scala/org/apache/kylin/engine/spark/builder/CubeSnapshotBuilder.scala
@@ -195,7 +195,7 @@ class CubeSnapshotBuilder extends Logging {
val lookupTablePKS = joinDesc.PKS.map(lookupTablePK => lookupTablePK.columnName)
val countDistinctColumn = df.agg(countDistinct(lookupTablePKS.head, lookupTablePKS.tail: _*)).collect().map(_.getLong(0)).head
if (countColumn != countDistinctColumn) {
- throw new IllegalStateException(s"Failed to build lookup table ${lookupTableName} snapshot for Dup key found, key= ${lookupTablePKS}")
+ throw new IllegalStateException(s"Failed to build lookup table ${lookupTableName} snapshot for Dup key found, key= ${lookupTablePKS.mkString(",")}")
}
}
}
diff --git a/kylin-spark-project/kylin-spark-test/src/test/java/org/apache/kylin/engine/spark2/file_pruning/NFilePruningTest.java b/kylin-spark-project/kylin-spark-test/src/test/java/org/apache/kylin/engine/spark2/file_pruning/NFilePruningTest.java
index a64638b..f004168 100644
--- a/kylin-spark-project/kylin-spark-test/src/test/java/org/apache/kylin/engine/spark2/file_pruning/NFilePruningTest.java
+++ b/kylin-spark-project/kylin-spark-test/src/test/java/org/apache/kylin/engine/spark2/file_pruning/NFilePruningTest.java
@@ -63,10 +63,9 @@ import java.util.UUID;
public class NFilePruningTest extends LocalWithSparkSessionTest {
- private String SQL_BASE2 = "SELECT COUNT(*) FROM TEST_KYLIN_FACT LEFT JOIN TEST_ORDER ON TEST_KYLIN_FACT.ORDER_ID = TEST_ORDER.ORDER_ID ";
- private final static String CUBE_NAME = "file_pruning_cube2";
- private String SQL_BASE = "SELECT COUNT(*) FROM TEST_ORDER LEFT JOIN TEST_KYLIN_FACT ON TEST_KYLIN_FACT.ORDER_ID = TEST_ORDER.ORDER_ID ";
- private final static String CUBE_NAME2 = "file_pruning_cube";
+ private String SQL_BASE = "SELECT COUNT(*) FROM TEST_KYLIN_FACT LEFT JOIN TEST_ORDER ON TEST_KYLIN_FACT.ORDER_ID = TEST_ORDER.ORDER_ID ";
+ private final static String CUBE_SHARD_BY_SELLER_ID = "file_pruning_cube";
+ private final static String CUBE_PRUNER_BY_PARTITION = "file_pruning_cube2";
protected KylinConfig config;
protected CubeManager cubeMgr;
protected ExecutableManager execMgr;
@@ -128,11 +127,11 @@ public class NFilePruningTest extends LocalWithSparkSessionTest {
public void testNonExistTimeRange() throws Exception {
Long start = DateFormat.stringToMillis("2023-01-01 00:00:00");
Long end = DateFormat.stringToMillis("2025-01-01 00:00:00");
- cleanupSegments(CUBE_NAME);
- buildCuboid(CUBE_NAME, new SegmentRange.TSRange(start, end));
+ cleanupSegments(CUBE_PRUNER_BY_PARTITION);
+ buildCuboid(CUBE_PRUNER_BY_PARTITION, new SegmentRange.TSRange(start, end));
populateSSWithCSVData(config, getProject(), KylinSparkEnv.getSparkSession());
- assertResultsAndScanFiles(SQL_BASE2, 1);
+ assertResultsAndScanFiles(SQL_BASE, 1);
}
@Test
@@ -141,7 +140,7 @@ public class NFilePruningTest extends LocalWithSparkSessionTest {
// [2009-01-01 00:00:00, 2011-01-01 00:00:00)
// [2011-01-01 00:00:00, 2013-01-01 00:00:00)
// [2013-01-01 00:00:00, 2015-01-01 00:00:00)
- buildMultiSegs(CUBE_NAME);
+ buildMultiSegs(CUBE_PRUNER_BY_PARTITION);
populateSSWithCSVData(getTestConfig(), getProject(), SparderContext.getSparkSession());
testSegPruningWithStringDate();
testSegPruningWithStringTimeStamp();
@@ -161,28 +160,28 @@ public class NFilePruningTest extends LocalWithSparkSessionTest {
}
public void testSegPruningWithStringTimeStamp() throws Exception {
- String and_pruning0 = SQL_BASE2
+ String and_pruning0 = SQL_BASE
+ "where CAL_DT > '2011-01-01 00:00:00' and CAL_DT < '2013-01-01 00:00:00'";
- String and_pruning1 = SQL_BASE2
+ String and_pruning1 = SQL_BASE
+ "where CAL_DT > '2011-01-01 00:00:00' and CAL_DT = '2016-01-01 00:00:00'";
- String or_pruning0 = SQL_BASE2
+ String or_pruning0 = SQL_BASE
+ "where CAL_DT > '2011-01-01 00:00:00' or CAL_DT = '2016-01-01 00:00:00'";
- String or_pruning1 = SQL_BASE2
+ String or_pruning1 = SQL_BASE
+ "where CAL_DT < '2009-01-01 00:00:00' or CAL_DT > '2015-01-01 00:00:00'";
- String pruning0 = SQL_BASE2 + "where CAL_DT < '2009-01-01 00:00:00'";
- String pruning1 = SQL_BASE2 + "where CAL_DT <= '2009-01-01 00:00:00'";
- String pruning2 = SQL_BASE2 + "where CAL_DT >= '2015-01-01 00:00:00'";
+ String pruning0 = SQL_BASE + "where CAL_DT < '2009-01-01 00:00:00'";
+ String pruning1 = SQL_BASE + "where CAL_DT <= '2009-01-01 00:00:00'";
+ String pruning2 = SQL_BASE + "where CAL_DT >= '2015-01-01 00:00:00'";
- String not0 = SQL_BASE2 + "where CAL_DT <> '2012-01-01 00:00:00'";
+ String not0 = SQL_BASE + "where CAL_DT <> '2012-01-01 00:00:00'";
- String in_pruning0 = SQL_BASE2
+ String in_pruning0 = SQL_BASE
+ "where CAL_DT in ('2009-01-01 00:00:00', '2008-01-01 00:00:00', '2016-01-01 00:00:00')";
- String in_pruning1 = SQL_BASE2
+ String in_pruning1 = SQL_BASE
+ "where CAL_DT in ('2008-01-01 00:00:00', '2016-01-01 00:00:00')";
- assertResultsAndScanFiles(SQL_BASE2, 3);
+ assertResultsAndScanFiles(SQL_BASE, 3);
assertResultsAndScanFiles(and_pruning0, 1);
assertResultsAndScanFiles(and_pruning1, 0);
@@ -254,7 +253,7 @@ public class NFilePruningTest extends LocalWithSparkSessionTest {
public void testSegShardPruning() throws Exception {
System.setProperty("kylin.storage.columnar.shard-rowcount", "100");
try {
- buildMultiSegs(CUBE_NAME2);
+ buildMultiSegs(CUBE_SHARD_BY_SELLER_ID);
populateSSWithCSVData(getTestConfig(), getProject(), KylinSparkEnv.getSparkSession());
@@ -362,17 +361,17 @@ public class NFilePruningTest extends LocalWithSparkSessionTest {
String in = SQL_BASE + "where SELLER_ID in (10000233,10000234,10000235)";
String isNull = SQL_BASE + "where SELLER_ID is NULL";
String and = SQL_BASE + "where SELLER_ID in (10000233,10000234,10000235) and SELLER_ID = 10000233 ";
- String or = SQL_BASE + "where SELLER_ID = 10000233 or SELLER_ID = 1 ";
+ String or = SQL_BASE + "where SELLER_ID = 10000233 or SELLER_ID = 2 ";
String notSupported0 = SQL_BASE + "where SELLER_ID <> 10000233";
String notSupported1 = SQL_BASE + "where SELLER_ID > 10000233";
assertResultsAndScanFiles(equality, 3);
- assertResultsAndScanFiles(in, 9);
+ assertResultsAndScanFiles(in, 7);
assertResultsAndScanFiles(isNull, 3);
assertResultsAndScanFiles(and, 3);
- assertResultsAndScanFiles(or, 5); //5
- assertResultsAndScanFiles(notSupported0, 57); //36
- assertResultsAndScanFiles(notSupported1, 57); //36
+ assertResultsAndScanFiles(or, 5);
+ assertResultsAndScanFiles(notSupported0, 13);
+ assertResultsAndScanFiles(notSupported1, 13);
List<Pair<String, String>> query = new ArrayList<>();
query.add(Pair.newPair("", equality));