You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by "englefly (via GitHub)" <gi...@apache.org> on 2023/04/26 06:22:25 UTC

[GitHub] [doris] englefly commented on a diff in pull request #18971: [Optimization](statistics) optimize Incremental statistics collection and statistics cleaning

englefly commented on code in PR #18971:
URL: https://github.com/apache/doris/pull/18971#discussion_r1177396388


##########
regression-test/suites/statistics/incremental_stats_test.groovy:
##########
@@ -0,0 +1,292 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_incremental_stats") {
+    def dbName = "test_incremental_stats"
+    def tblName = "${dbName}.example_tbl"
+
+    def colStatisticsTblName = "__internal_schema.column_statistics"
+    def analysisJobsTblName = "__internal_schema.analysis_jobs"
+
+    def columnNames = """
+        (
+            `t_1682176142000_user_id`, `t_1682176142000_date`, 
+            `t_1682176142000_city`, `t_1682176142000_age`, `t_1682176142000_sex`, 
+            `t_1682176142000_last_visit_date`, `t_1682176142000_cost`, 
+            `t_1682176142000_max_dwell_time`, `t_1682176142000_min_dwell_time`
+        )
+    """
+
+    def columnNameValues = """
+        (
+            't_1682176142000_user_id', 't_1682176142000_date', 't_1682176142000_city', 
+            't_1682176142000_age', 't_1682176142000_sex', 't_1682176142000_last_visit_date', 
+            't_1682176142000_cost', 't_1682176142000_max_dwell_time', 't_1682176142000_min_dwell_time'
+        ) 
+    """
+
+    def query_col_statistics_with_order_sql = """
+        SELECT 
+            count, 
+            ndv, 
+            null_count, 
+            min, 
+            max, 
+            data_size_in_bytes 
+        FROM 
+            ${colStatisticsTblName} 
+        WHERE 
+            col_id IN ${columnNameValues}
+        ORDER BY 
+            col_id,
+            min, 
+            max,
+            count, 
+            ndv, 
+            null_count, 
+            data_size_in_bytes;
+    """
+
+    def query_analysis_jobs_count_sql = """
+        SELECT 
+            COUNT(*) 
+        FROM 
+            ${analysisJobsTblName} 
+        WHERE 
+            col_name IN ${columnNameValues};
+    """
+
+    sql """
+        DROP DATABASE IF EXISTS ${dbName};
+    """
+
+    sql """
+        CREATE DATABASE IF NOT EXISTS ${dbName};
+    """
+
+    sql """
+        DROP TABLE IF EXISTS ${tblName};
+    """
+
+    sql """
+        CREATE TABLE IF NOT EXISTS ${tblName} (
+            `t_1682176142000_user_id` LARGEINT NOT NULL,
+            `t_1682176142000_date` DATE NOT NULL,
+            `t_1682176142000_city` VARCHAR(20),
+            `t_1682176142000_age` SMALLINT,
+            `t_1682176142000_sex` TINYINT,
+            `t_1682176142000_last_visit_date` DATETIME REPLACE,
+            `t_1682176142000_cost` BIGINT SUM,
+            `t_1682176142000_max_dwell_time` INT MAX,
+            `t_1682176142000_min_dwell_time` INT MIN
+        ) ENGINE=OLAP
+        AGGREGATE KEY(`t_1682176142000_user_id`, `t_1682176142000_date`,
+         `t_1682176142000_city`, `t_1682176142000_age`, `t_1682176142000_sex`)
+        PARTITION BY LIST(`t_1682176142000_date`)
+        (
+            PARTITION `p_201701` VALUES IN ("2017-10-01"),
+            PARTITION `p_201702` VALUES IN ("2017-10-02"),
+            PARTITION `p_201703` VALUES IN ("2017-10-03")
+        )
+        DISTRIBUTED BY HASH(`t_1682176142000_user_id`) BUCKETS 1
+        PROPERTIES (
+            "replication_num" = "1"
+        );
+    """
+
+    sql """
+        INSERT INTO ${tblName} ${columnNames}
+        VALUES (10000, "2017-10-01", "北京", 20, 0, "2017-10-01 07:00:00", 15, 2, 2),
+            (10000, "2017-10-01", "北京", 20, 0, "2017-10-01 06:00:00", 20, 10, 10),
+            (10001, "2017-10-01", "北京", 30, 1, "2017-10-01 17:05:45", 2, 22, 22),
+            (10002, "2017-10-02", "上海", 20, 1, "2017-10-02 12:59:12", 200, 5, 5),
+            (10003, "2017-10-02", "广州", 32, 0, "2017-10-02 11:20:00", 30, 11, 11),
+            (10004, "2017-10-01", "深圳", 35, 0, "2017-10-01 10:00:15", 100, 3, 3),
+            (10004, "2017-10-03", "深圳", 35, 0, "2017-10-03 10:20:22", 11, 6, 6);
+    """
+
+    sql """  
+        DELETE FROM __internal_schema.column_statistics
+        WHERE col_id IN ${columnNameValues}  
+        """
+
+    // TODO delete by database name and table name
+    sql """
+        DELETE FROM __internal_schema.analysis_jobs WHERE job_id IS NOT NULL;
+        """
+
+    // 1. Firstly do a full collection of statistics
+    sql """
+        ANALYZE TABLE ${tblName} ${columnNames};
+    """
+
+     Thread.sleep(5000)
+
+    // Collecting all 9 columns will generate 9 tasks (total tasks: 9)
+    qt_sql query_analysis_jobs_count_sql
+
+    // Check the collected statistics
+    qt_sql query_col_statistics_with_order_sql
+
+    // Incrementally collect statistics
+    sql """
+        ANALYZE TABLE ${tblName} ${columnNames} WITH incremental;
+    """
+
+     Thread.sleep(5000)
+
+    // The table data has not changed, and no new tasks should be generated (total tasks:  9)
+    qt_sql query_analysis_jobs_count_sql
+
+    // Statistics won't change either
+    qt_sql query_col_statistics_with_order_sql

Review Comment:
   this check may fail. sleep(5000) can not guarantee this.



##########
regression-test/suites/statistics/incremental_stats_test.groovy:
##########
@@ -0,0 +1,292 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_incremental_stats") {
+    def dbName = "test_incremental_stats"
+    def tblName = "${dbName}.example_tbl"
+
+    def colStatisticsTblName = "__internal_schema.column_statistics"
+    def analysisJobsTblName = "__internal_schema.analysis_jobs"
+
+    def columnNames = """
+        (
+            `t_1682176142000_user_id`, `t_1682176142000_date`, 
+            `t_1682176142000_city`, `t_1682176142000_age`, `t_1682176142000_sex`, 
+            `t_1682176142000_last_visit_date`, `t_1682176142000_cost`, 
+            `t_1682176142000_max_dwell_time`, `t_1682176142000_min_dwell_time`
+        )
+    """
+
+    def columnNameValues = """
+        (
+            't_1682176142000_user_id', 't_1682176142000_date', 't_1682176142000_city', 
+            't_1682176142000_age', 't_1682176142000_sex', 't_1682176142000_last_visit_date', 
+            't_1682176142000_cost', 't_1682176142000_max_dwell_time', 't_1682176142000_min_dwell_time'
+        ) 
+    """
+
+    def query_col_statistics_with_order_sql = """
+        SELECT 
+            count, 
+            ndv, 
+            null_count, 
+            min, 
+            max, 
+            data_size_in_bytes 
+        FROM 
+            ${colStatisticsTblName} 
+        WHERE 
+            col_id IN ${columnNameValues}
+        ORDER BY 
+            col_id,
+            min, 
+            max,
+            count, 
+            ndv, 
+            null_count, 
+            data_size_in_bytes;
+    """
+
+    def query_analysis_jobs_count_sql = """
+        SELECT 
+            COUNT(*) 
+        FROM 
+            ${analysisJobsTblName} 
+        WHERE 
+            col_name IN ${columnNameValues};
+    """
+
+    sql """
+        DROP DATABASE IF EXISTS ${dbName};
+    """
+
+    sql """
+        CREATE DATABASE IF NOT EXISTS ${dbName};
+    """
+
+    sql """
+        DROP TABLE IF EXISTS ${tblName};
+    """
+
+    sql """
+        CREATE TABLE IF NOT EXISTS ${tblName} (
+            `t_1682176142000_user_id` LARGEINT NOT NULL,
+            `t_1682176142000_date` DATE NOT NULL,
+            `t_1682176142000_city` VARCHAR(20),
+            `t_1682176142000_age` SMALLINT,
+            `t_1682176142000_sex` TINYINT,
+            `t_1682176142000_last_visit_date` DATETIME REPLACE,
+            `t_1682176142000_cost` BIGINT SUM,
+            `t_1682176142000_max_dwell_time` INT MAX,
+            `t_1682176142000_min_dwell_time` INT MIN
+        ) ENGINE=OLAP
+        AGGREGATE KEY(`t_1682176142000_user_id`, `t_1682176142000_date`,
+         `t_1682176142000_city`, `t_1682176142000_age`, `t_1682176142000_sex`)
+        PARTITION BY LIST(`t_1682176142000_date`)
+        (
+            PARTITION `p_201701` VALUES IN ("2017-10-01"),
+            PARTITION `p_201702` VALUES IN ("2017-10-02"),
+            PARTITION `p_201703` VALUES IN ("2017-10-03")
+        )
+        DISTRIBUTED BY HASH(`t_1682176142000_user_id`) BUCKETS 1
+        PROPERTIES (
+            "replication_num" = "1"
+        );
+    """
+
+    sql """
+        INSERT INTO ${tblName} ${columnNames}
+        VALUES (10000, "2017-10-01", "北京", 20, 0, "2017-10-01 07:00:00", 15, 2, 2),
+            (10000, "2017-10-01", "北京", 20, 0, "2017-10-01 06:00:00", 20, 10, 10),
+            (10001, "2017-10-01", "北京", 30, 1, "2017-10-01 17:05:45", 2, 22, 22),
+            (10002, "2017-10-02", "上海", 20, 1, "2017-10-02 12:59:12", 200, 5, 5),
+            (10003, "2017-10-02", "广州", 32, 0, "2017-10-02 11:20:00", 30, 11, 11),
+            (10004, "2017-10-01", "深圳", 35, 0, "2017-10-01 10:00:15", 100, 3, 3),
+            (10004, "2017-10-03", "深圳", 35, 0, "2017-10-03 10:20:22", 11, 6, 6);
+    """
+
+    sql """  
+        DELETE FROM __internal_schema.column_statistics
+        WHERE col_id IN ${columnNameValues}  
+        """
+
+    // TODO delete by database name and table name
+    sql """
+        DELETE FROM __internal_schema.analysis_jobs WHERE job_id IS NOT NULL;
+        """
+
+    // 1. Firstly do a full collection of statistics
+    sql """
+        ANALYZE TABLE ${tblName} ${columnNames};
+    """
+
+     Thread.sleep(5000)
+
+    // Collecting all 9 columns will generate 9 tasks (total tasks: 9)
+    qt_sql query_analysis_jobs_count_sql

Review Comment:
   this check may fail. sleep(5000) can not guarantee this.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org