You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kylin.apache.org by xx...@apache.org on 2020/12/18 08:57:32 UTC

[kylin] branch kylin-on-parquet-v2 updated: KYLIN-4845 Fix NFilePruningTest report dup key error

This is an automated email from the ASF dual-hosted git repository.

xxyu pushed a commit to branch kylin-on-parquet-v2
in repository https://gitbox.apache.org/repos/asf/kylin.git


The following commit(s) were added to refs/heads/kylin-on-parquet-v2 by this push:
     new 49dcdd2  KYLIN-4845 Fix NFilePruningTest report dup key error
49dcdd2 is described below

commit 49dcdd270a0a014123e4ba1b586c49e6dd508540
Author: yaqian.zhang <59...@qq.com>
AuthorDate: Fri Dec 18 15:31:49 2020 +0800

    KYLIN-4845 Fix NFilePruningTest report dup key error
---
 .../file_prunning/cube_desc/file_pruning_cube.json | 61 +++++++++-------------
 .../cube_desc/file_pruning_cube2.json              |  2 +-
 .../model_desc/file_pruning_model.json             | 22 ++++----
 .../model_desc/file_pruning_model2.json            | 40 --------------
 .../file_prunning/project/default.json             |  2 +-
 .../engine/spark/builder/CubeSnapshotBuilder.scala |  2 +-
 .../spark2/file_pruning/NFilePruningTest.java      | 49 +++++++++--------
 7 files changed, 62 insertions(+), 116 deletions(-)

diff --git a/examples/test_case_data/file_prunning/cube_desc/file_pruning_cube.json b/examples/test_case_data/file_prunning/cube_desc/file_pruning_cube.json
index 01581d9..0ec4904 100644
--- a/examples/test_case_data/file_prunning/cube_desc/file_pruning_cube.json
+++ b/examples/test_case_data/file_prunning/cube_desc/file_pruning_cube.json
@@ -1,6 +1,6 @@
 {
   "uuid" : "330b1839-1baf-5e4f-7f4c-ad173a5217c8",
-  "last_modified" : 1589194835241,
+  "last_modified" : 1589268257682,
   "version" : "3.9.9.1",
   "name" : "file_pruning_cube",
   "is_draft" : false,
@@ -9,29 +9,14 @@
   "null_string" : null,
   "dimensions" : [ {
     "name" : "ORDER_ID",
-    "table" : "TEST_ORDER",
+    "table" : "TEST_KYLIN_FACT",
     "column" : "ORDER_ID",
     "derived" : null
   }, {
-    "name" : "BUYER_ID",
-    "table" : "TEST_ORDER",
-    "column" : "BUYER_ID",
-    "derived" : null
-  }, {
-    "name" : "TEST_DATE_ENC",
-    "table" : "TEST_ORDER",
-    "column" : "TEST_DATE_ENC",
-    "derived" : null
-  }, {
-    "name" : "TEST_TIME_ENC",
-    "table" : "TEST_ORDER",
-    "column" : "TEST_TIME_ENC",
-    "derived" : null
-  }, {
-    "name" : "ORDER_ID",
+    "name" : "CAL_DT",
     "table" : "TEST_KYLIN_FACT",
-    "column" : null,
-    "derived" : [ "ORDER_ID" ]
+    "column" : "CAL_DT",
+    "derived" : null
   }, {
     "name" : "LSTG_FORMAT_NAME",
     "table" : "TEST_KYLIN_FACT",
@@ -52,6 +37,16 @@
     "table" : "TEST_KYLIN_FACT",
     "column" : "PRICE",
     "derived" : null
+  }, {
+    "name" : "ORDER_ID",
+    "table" : "TEST_ORDER",
+    "column" : null,
+    "derived" : [ "ORDER_ID" ]
+  }, {
+    "name" : "TEST_DATE_ENC",
+    "table" : "TEST_ORDER",
+    "column" : null,
+    "derived" : [ "TEST_DATE_ENC" ]
   } ],
   "measures" : [ {
     "name" : "_COUNT_",
@@ -67,20 +62,20 @@
   "dictionaries" : [ ],
   "rowkey" : {
     "rowkey_columns" : [ {
-      "column" : "TEST_ORDER.ORDER_ID",
+      "column" : "TEST_KYLIN_FACT.ORDER_ID",
       "encoding" : "dict",
       "isShardBy" : false
     }, {
-      "column" : "TEST_ORDER.BUYER_ID",
-      "encoding" : "dict",
+      "column" : "TEST_KYLIN_FACT.CAL_DT",
+      "encoding" : "date",
       "isShardBy" : false
     }, {
-      "column" : "TEST_ORDER.TEST_DATE_ENC",
-      "encoding" : "date",
+      "column" : "TEST_KYLIN_FACT.LSTG_FORMAT_NAME",
+      "encoding" : "dict",
       "isShardBy" : false
     }, {
-      "column" : "TEST_ORDER.TEST_TIME_ENC",
-      "encoding" : "time",
+      "column" : "TEST_KYLIN_FACT.SLR_SEGMENT_CD",
+      "encoding" : "dict",
       "isShardBy" : false
     }, {
       "column" : "TEST_KYLIN_FACT.SELLER_ID",
@@ -90,14 +85,6 @@
       "column" : "TEST_KYLIN_FACT.PRICE",
       "encoding" : "dict",
       "isShardBy" : false
-    }, {
-      "column" : "TEST_KYLIN_FACT.SLR_SEGMENT_CD",
-      "encoding" : "dict",
-      "isShardBy" : false
-    } , {
-      "column" : "TEST_KYLIN_FACT.LSTG_FORMAT_NAME",
-      "encoding" : "dict",
-      "isShardBy" : false
     } ]
   },
   "hbase_mapping" : {
@@ -110,14 +97,14 @@
     } ]
   },
   "aggregation_groups" : [ {
-    "includes" : [ "TEST_ORDER.ORDER_ID", "TEST_ORDER.BUYER_ID", "TEST_ORDER.TEST_DATE_ENC", "TEST_ORDER.TEST_TIME_ENC" ],
+    "includes" : [ "TEST_KYLIN_FACT.ORDER_ID", "TEST_KYLIN_FACT.CAL_DT", "TEST_KYLIN_FACT.LSTG_FORMAT_NAME", "TEST_KYLIN_FACT.SLR_SEGMENT_CD", "TEST_KYLIN_FACT.SELLER_ID", "TEST_KYLIN_FACT.PRICE" ],
     "select_rule" : {
       "hierarchy_dims" : [ ],
       "mandatory_dims" : [ ],
       "joint_dims" : [ ]
     }
   } ],
-  "signature" : "2BsFyXxmB2F+FNlH4RxDwQ==",
+  "signature" : "ioyeFiil5XMOa8G02uVhgQ==",
   "notify_list" : [ ],
   "status_need_notify" : [ "ERROR", "DISCARDED", "SUCCEED" ],
   "partition_date_start" : 0,
diff --git a/examples/test_case_data/file_prunning/cube_desc/file_pruning_cube2.json b/examples/test_case_data/file_prunning/cube_desc/file_pruning_cube2.json
index f9fa85b..8e2bd3a 100644
--- a/examples/test_case_data/file_prunning/cube_desc/file_pruning_cube2.json
+++ b/examples/test_case_data/file_prunning/cube_desc/file_pruning_cube2.json
@@ -4,7 +4,7 @@
   "version" : "3.9.9.1",
   "name" : "file_pruning_cube2",
   "is_draft" : false,
-  "model_name" : "file_pruning_model2",
+  "model_name" : "file_pruning_model",
   "description" : "",
   "null_string" : null,
   "dimensions" : [ {
diff --git a/examples/test_case_data/file_prunning/model_desc/file_pruning_model.json b/examples/test_case_data/file_prunning/model_desc/file_pruning_model.json
index fb35bd8..018a384 100644
--- a/examples/test_case_data/file_prunning/model_desc/file_pruning_model.json
+++ b/examples/test_case_data/file_prunning/model_desc/file_pruning_model.json
@@ -1,33 +1,33 @@
 {
   "uuid" : "1433c51f-fa34-3c12-6d10-d4a59338a19d",
-  "last_modified" : 1589194803840,
+  "last_modified" : 0,
   "version" : "3.9.9.1",
   "name" : "file_pruning_model",
   "owner" : "ADMIN",
   "is_draft" : false,
   "description" : "",
-  "fact_table" : "DEFAULT.TEST_ORDER",
+  "fact_table" : "DEFAULT.TEST_KYLIN_FACT",
   "lookups" : [ {
-    "table" : "DEFAULT.TEST_KYLIN_FACT",
+    "table" : "DEFAULT.TEST_ORDER",
     "kind" : "LOOKUP",
-    "alias" : "TEST_KYLIN_FACT",
+    "alias" : "TEST_ORDER",
     "join" : {
       "type" : "left",
-      "primary_key" : [ "TEST_KYLIN_FACT.ORDER_ID" ],
-      "foreign_key" : [ "TEST_ORDER.ORDER_ID" ]
+      "primary_key" : [ "TEST_ORDER.ORDER_ID" ],
+      "foreign_key" : [ "TEST_KYLIN_FACT.ORDER_ID" ]
     }
   } ],
   "dimensions" : [ {
-    "table" : "TEST_ORDER",
-    "columns" : [ "ORDER_ID", "BUYER_ID", "TEST_DATE_ENC", "TEST_TIME_ENC" ]
-  }, {
     "table" : "TEST_KYLIN_FACT",
-    "columns" : [ "SELLER_ID", "PRICE", "SLR_SEGMENT_CD", "LSTG_FORMAT_NAME", "ORDER_ID" ]
+    "columns" : [ "SELLER_ID", "PRICE", "SLR_SEGMENT_CD", "LSTG_FORMAT_NAME", "CAL_DT", "ORDER_ID" ]
+  }, {
+    "table" : "TEST_ORDER",
+    "columns" : [ "ORDER_ID", "TEST_DATE_ENC" ]
   } ],
   "metrics" : [ ],
   "filter_condition" : "",
   "partition_desc" : {
-    "partition_date_column" : "TEST_ORDER.TEST_TIME_ENC",
+    "partition_date_column" : "TEST_KYLIN_FACT.CAL_DT",
     "partition_time_column" : null,
     "partition_date_start" : 0,
     "partition_date_format" : "yyyy-MM-dd",
diff --git a/examples/test_case_data/file_prunning/model_desc/file_pruning_model2.json b/examples/test_case_data/file_prunning/model_desc/file_pruning_model2.json
deleted file mode 100644
index 2f29fc6..0000000
--- a/examples/test_case_data/file_prunning/model_desc/file_pruning_model2.json
+++ /dev/null
@@ -1,40 +0,0 @@
-{
-  "uuid" : "7622b8c5-a5fb-e10b-4038-4cbccfc2df24",
-  "last_modified" : 1589268019470,
-  "version" : "3.9.9.1",
-  "name" : "file_pruning_model2",
-  "owner" : "ADMIN",
-  "is_draft" : false,
-  "description" : "",
-  "fact_table" : "DEFAULT.TEST_KYLIN_FACT",
-  "lookups" : [ {
-    "table" : "DEFAULT.TEST_ORDER",
-    "kind" : "LOOKUP",
-    "alias" : "TEST_ORDER",
-    "join" : {
-      "type" : "left",
-      "primary_key" : [ "TEST_ORDER.ORDER_ID" ],
-      "foreign_key" : [ "TEST_KYLIN_FACT.ORDER_ID" ]
-    }
-  } ],
-  "dimensions" : [ {
-    "table" : "TEST_KYLIN_FACT",
-    "columns" : [ "SELLER_ID", "PRICE", "SLR_SEGMENT_CD", "LSTG_FORMAT_NAME", "CAL_DT", "ORDER_ID" ]
-  }, {
-    "table" : "TEST_ORDER",
-    "columns" : [ "ORDER_ID", "TEST_DATE_ENC" ]
-  } ],
-  "metrics" : [ ],
-  "filter_condition" : "",
-  "partition_desc" : {
-    "partition_date_column" : "TEST_KYLIN_FACT.CAL_DT",
-    "partition_time_column" : null,
-    "partition_date_start" : 0,
-    "partition_date_format" : "yyyy-MM-dd",
-    "partition_time_format" : "HH:mm:ss",
-    "partition_type" : "APPEND",
-    "partition_condition_builder" : "org.apache.kylin.metadata.model.PartitionDesc$DefaultPartitionConditionBuilder"
-  },
-  "capacity" : "MEDIUM",
-  "projectName" : "default"
-}
\ No newline at end of file
diff --git a/examples/test_case_data/file_prunning/project/default.json b/examples/test_case_data/file_prunning/project/default.json
index 9ebc29f..e983b5d 100644
--- a/examples/test_case_data/file_prunning/project/default.json
+++ b/examples/test_case_data/file_prunning/project/default.json
@@ -19,7 +19,7 @@
     "type" : "CUBE",
     "realization" : "file_pruning_cube_measure"
   } ],
-  "models" : [ "file_pruning_model","file_pruning_model2","file_pruning_model_measure" ],
+  "models" : [ "file_pruning_model","file_pruning_model_measure" ],
   "ext_filters" : [ ],
   "override_kylin_properties" : { }
 }
\ No newline at end of file
diff --git a/kylin-spark-project/kylin-spark-engine/src/main/scala/org/apache/kylin/engine/spark/builder/CubeSnapshotBuilder.scala b/kylin-spark-project/kylin-spark-engine/src/main/scala/org/apache/kylin/engine/spark/builder/CubeSnapshotBuilder.scala
index d1b62f0..146dcab 100644
--- a/kylin-spark-project/kylin-spark-engine/src/main/scala/org/apache/kylin/engine/spark/builder/CubeSnapshotBuilder.scala
+++ b/kylin-spark-project/kylin-spark-engine/src/main/scala/org/apache/kylin/engine/spark/builder/CubeSnapshotBuilder.scala
@@ -195,7 +195,7 @@ class CubeSnapshotBuilder extends Logging {
         val lookupTablePKS = joinDesc.PKS.map(lookupTablePK => lookupTablePK.columnName)
         val countDistinctColumn = df.agg(countDistinct(lookupTablePKS.head, lookupTablePKS.tail: _*)).collect().map(_.getLong(0)).head
         if (countColumn != countDistinctColumn) {
-          throw new IllegalStateException(s"Failed to build lookup table ${lookupTableName} snapshot for Dup key found, key= ${lookupTablePKS}")
+          throw new IllegalStateException(s"Failed to build lookup table ${lookupTableName} snapshot for Dup key found, key= ${lookupTablePKS.mkString(",")}")
         }
     }
   }
diff --git a/kylin-spark-project/kylin-spark-test/src/test/java/org/apache/kylin/engine/spark2/file_pruning/NFilePruningTest.java b/kylin-spark-project/kylin-spark-test/src/test/java/org/apache/kylin/engine/spark2/file_pruning/NFilePruningTest.java
index a64638b..f004168 100644
--- a/kylin-spark-project/kylin-spark-test/src/test/java/org/apache/kylin/engine/spark2/file_pruning/NFilePruningTest.java
+++ b/kylin-spark-project/kylin-spark-test/src/test/java/org/apache/kylin/engine/spark2/file_pruning/NFilePruningTest.java
@@ -63,10 +63,9 @@ import java.util.UUID;
 
 public class NFilePruningTest extends LocalWithSparkSessionTest {
 
-    private String SQL_BASE2 = "SELECT COUNT(*)  FROM TEST_KYLIN_FACT LEFT JOIN TEST_ORDER ON TEST_KYLIN_FACT.ORDER_ID = TEST_ORDER.ORDER_ID ";
-    private final static String CUBE_NAME = "file_pruning_cube2";
-    private String SQL_BASE = "SELECT COUNT(*)  FROM TEST_ORDER LEFT JOIN TEST_KYLIN_FACT ON TEST_KYLIN_FACT.ORDER_ID = TEST_ORDER.ORDER_ID ";
-    private final static String CUBE_NAME2 = "file_pruning_cube";
+    private String SQL_BASE = "SELECT COUNT(*)  FROM TEST_KYLIN_FACT LEFT JOIN TEST_ORDER ON TEST_KYLIN_FACT.ORDER_ID = TEST_ORDER.ORDER_ID ";
+    private final static String CUBE_SHARD_BY_SELLER_ID = "file_pruning_cube";
+    private final static String CUBE_PRUNER_BY_PARTITION = "file_pruning_cube2";
     protected KylinConfig config;
     protected CubeManager cubeMgr;
     protected ExecutableManager execMgr;
@@ -128,11 +127,11 @@ public class NFilePruningTest extends LocalWithSparkSessionTest {
     public void testNonExistTimeRange() throws Exception {
         Long start = DateFormat.stringToMillis("2023-01-01 00:00:00");
         Long end = DateFormat.stringToMillis("2025-01-01 00:00:00");
-        cleanupSegments(CUBE_NAME);
-        buildCuboid(CUBE_NAME, new SegmentRange.TSRange(start, end));
+        cleanupSegments(CUBE_PRUNER_BY_PARTITION);
+        buildCuboid(CUBE_PRUNER_BY_PARTITION, new SegmentRange.TSRange(start, end));
 
         populateSSWithCSVData(config, getProject(), KylinSparkEnv.getSparkSession());
-        assertResultsAndScanFiles(SQL_BASE2, 1);
+        assertResultsAndScanFiles(SQL_BASE, 1);
     }
 
     @Test
@@ -141,7 +140,7 @@ public class NFilePruningTest extends LocalWithSparkSessionTest {
         // [2009-01-01 00:00:00, 2011-01-01 00:00:00)
         // [2011-01-01 00:00:00, 2013-01-01 00:00:00)
         // [2013-01-01 00:00:00, 2015-01-01 00:00:00)
-        buildMultiSegs(CUBE_NAME);
+        buildMultiSegs(CUBE_PRUNER_BY_PARTITION);
         populateSSWithCSVData(getTestConfig(), getProject(), SparderContext.getSparkSession());
         testSegPruningWithStringDate();
         testSegPruningWithStringTimeStamp();
@@ -161,28 +160,28 @@ public class NFilePruningTest extends LocalWithSparkSessionTest {
     }
 
     public void testSegPruningWithStringTimeStamp() throws Exception {
-        String and_pruning0 = SQL_BASE2
+        String and_pruning0 = SQL_BASE
                 + "where CAL_DT > '2011-01-01 00:00:00' and CAL_DT < '2013-01-01 00:00:00'";
-        String and_pruning1 = SQL_BASE2
+        String and_pruning1 = SQL_BASE
                 + "where CAL_DT > '2011-01-01 00:00:00' and CAL_DT = '2016-01-01 00:00:00'";
 
-        String or_pruning0 = SQL_BASE2
+        String or_pruning0 = SQL_BASE
                 + "where CAL_DT > '2011-01-01 00:00:00' or CAL_DT = '2016-01-01 00:00:00'";
-        String or_pruning1 = SQL_BASE2
+        String or_pruning1 = SQL_BASE
                 + "where CAL_DT < '2009-01-01 00:00:00' or CAL_DT > '2015-01-01 00:00:00'";
 
-        String pruning0 = SQL_BASE2 + "where CAL_DT < '2009-01-01 00:00:00'";
-        String pruning1 = SQL_BASE2 + "where CAL_DT <= '2009-01-01 00:00:00'";
-        String pruning2 = SQL_BASE2 + "where CAL_DT >= '2015-01-01 00:00:00'";
+        String pruning0 = SQL_BASE + "where CAL_DT < '2009-01-01 00:00:00'";
+        String pruning1 = SQL_BASE + "where CAL_DT <= '2009-01-01 00:00:00'";
+        String pruning2 = SQL_BASE + "where CAL_DT >= '2015-01-01 00:00:00'";
 
-        String not0 = SQL_BASE2 + "where CAL_DT <> '2012-01-01 00:00:00'";
+        String not0 = SQL_BASE + "where CAL_DT <> '2012-01-01 00:00:00'";
 
-        String in_pruning0 = SQL_BASE2
+        String in_pruning0 = SQL_BASE
                 + "where CAL_DT in ('2009-01-01 00:00:00', '2008-01-01 00:00:00', '2016-01-01 00:00:00')";
-        String in_pruning1 = SQL_BASE2
+        String in_pruning1 = SQL_BASE
                 + "where CAL_DT in ('2008-01-01 00:00:00', '2016-01-01 00:00:00')";
 
-        assertResultsAndScanFiles(SQL_BASE2, 3);
+        assertResultsAndScanFiles(SQL_BASE, 3);
 
         assertResultsAndScanFiles(and_pruning0, 1);
         assertResultsAndScanFiles(and_pruning1, 0);
@@ -254,7 +253,7 @@ public class NFilePruningTest extends LocalWithSparkSessionTest {
     public void testSegShardPruning() throws Exception {
         System.setProperty("kylin.storage.columnar.shard-rowcount", "100");
         try {
-            buildMultiSegs(CUBE_NAME2);
+            buildMultiSegs(CUBE_SHARD_BY_SELLER_ID);
 
             populateSSWithCSVData(getTestConfig(), getProject(), KylinSparkEnv.getSparkSession());
 
@@ -362,17 +361,17 @@ public class NFilePruningTest extends LocalWithSparkSessionTest {
         String in = SQL_BASE + "where SELLER_ID in (10000233,10000234,10000235)";
         String isNull = SQL_BASE + "where SELLER_ID is NULL";
         String and = SQL_BASE + "where SELLER_ID in (10000233,10000234,10000235) and SELLER_ID = 10000233 ";
-        String or = SQL_BASE + "where SELLER_ID = 10000233 or SELLER_ID = 1 ";
+        String or = SQL_BASE + "where SELLER_ID = 10000233 or SELLER_ID = 2 ";
         String notSupported0 = SQL_BASE + "where SELLER_ID <> 10000233";
         String notSupported1 = SQL_BASE + "where SELLER_ID > 10000233";
 
         assertResultsAndScanFiles(equality, 3);
-        assertResultsAndScanFiles(in, 9);
+        assertResultsAndScanFiles(in, 7);
         assertResultsAndScanFiles(isNull, 3);
         assertResultsAndScanFiles(and, 3);
-        assertResultsAndScanFiles(or, 5); //5
-        assertResultsAndScanFiles(notSupported0, 57); //36
-        assertResultsAndScanFiles(notSupported1, 57);  //36
+        assertResultsAndScanFiles(or, 5);
+        assertResultsAndScanFiles(notSupported0, 13);
+        assertResultsAndScanFiles(notSupported1, 13);
 
         List<Pair<String, String>> query = new ArrayList<>();
         query.add(Pair.newPair("", equality));