You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@asterixdb.apache.org by ht...@apache.org on 2019/08/08 03:00:18 UTC
[asterixdb] branch master updated: [ASTERIXDB-2623][FUN] TPC-DS revert partitioning behavior back to default

This is an automated email from the ASF dual-hosted git repository.

htowaileb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/asterixdb.git


The following commit(s) were added to refs/heads/master by this push:
     new 3c75c65  [ASTERIXDB-2623][FUN] TPC-DS revert partitioning behavior back to default
3c75c65 is described below

commit 3c75c653009d545443b9d52bdfaabdc645e94a80
Author: Hussain Towaileb <Hu...@Couchbase.com>
AuthorDate: Wed Aug 7 11:01:12 2019 +0300

    [ASTERIXDB-2623][FUN] TPC-DS revert partitioning behavior back to default
    
    - user model changes: no
    - storage format changes: no
    - interface changes: no
    
    Details:
    - To avoid unexpected behavior, this change reverts the partitioning
    logic back to the default behavior from the library.
    
    Change-Id: I0de179e33cd74dea333d163a108b1b7606b45643
    Reviewed-on: https://asterix-gerrit.ics.uci.edu/3516
    Sonar-Qube: Jenkins <je...@fulliautomatix.ics.uci.edu>
    Contrib: Jenkins <je...@fulliautomatix.ics.uci.edu>
    Integration-Tests: Jenkins <je...@fulliautomatix.ics.uci.edu>
    Tested-by: Jenkins <je...@fulliautomatix.ics.uci.edu>
    Reviewed-by: Hussain Towaileb <hu...@gmail.com>
    Reviewed-by: Till Westmann <ti...@apache.org>
---
 .../app/function/TPCDSDataGeneratorReader.java     | 69 ++--------------------
 1 file changed, 6 insertions(+), 63 deletions(-)

diff --git a/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/function/TPCDSDataGeneratorReader.java b/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/function/TPCDSDataGeneratorReader.java
index 1122a29..3d08c01 100644
--- a/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/function/TPCDSDataGeneratorReader.java
+++ b/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/function/TPCDSDataGeneratorReader.java
@@ -36,12 +36,11 @@ import com.teradata.tpcds.Session;
 import com.teradata.tpcds.Table;
 
 /**
- * Each partition will be running a TPCDS data generator reader instance. Depending on the number of partitions, the
- * data generator will parallelize its work based on the number of partitions. The reader is passed the parallelism
- * level based on the number of partition instances.
+ * Each partition will be running a TPCDS data generator reader instance. The data generator will parallelize its work
+ * based on the number of partitions available. The reader is passed the parallelism level based on the number of
+ * partition instances.
  *
- * The function automatically handles generating the data for a single specified table or for all the tables. Also,
- * the parallelism will take place regardless of the selected data size to be generated.
+ * The function automatically handles generating the data for a single specified table or for all the tables.
  */
 
 public class TPCDSDataGeneratorReader extends FunctionReader {
@@ -86,7 +85,7 @@ public class TPCDSDataGeneratorReader extends FunctionReader {
 
         // Iterators for the tables to generate the data for
         for (Table table : selectedTables) {
-            Results result = calculateParallelism(table, session);
+            Results result = Results.constructResults(table, session);
             tableIterators.add(result.iterator());
         }
     }
@@ -134,7 +133,7 @@ public class TPCDSDataGeneratorReader extends FunctionReader {
     }
 
     /**
-     * Builds the string record from the generated values by the data generator. The column name for each value is
+     * Builds the string record from the generated values by the data generator. The field name for each value is
      * extracted from the table from which the data is being generated.
      *
      * @param values List containing all the generated column values
@@ -223,62 +222,6 @@ public class TPCDSDataGeneratorReader extends FunctionReader {
     }
 
     /**
-     * As the TPC-DS library has constraints on activating the parallelism (table must be generating 1,000,000 records
-     * based on a scaling factor), we're gonna override that behavior and calculate the parallelism manually. This
-     * will ensure the activation of the parallelism regardless of the data size being generated.
-     *
-     * @param table table to generate the data for
-     * @param session session containing the parallelism and scaling information
-     *
-     * @return Results that holds a lazy-iterator to generate the data based on the calculated parameters.
-     */
-    private Results calculateParallelism(Table table, Session session) {
-
-        // Total and parallelism level
-        long total = session.getScaling().getRowCount(table);
-        int parallelism = session.getParallelism();
-
-        // Row set size to be generated for each partition
-        long rowSetSize = total / parallelism;
-
-        // Special case: WEB_SITE table sometimes relies on the previous records, this could be a problem if the
-        // previous record is on a different thread. Since it's a small table, we'll generate it all on the first
-        // thread and let the other threads generate nothing
-        if (table.equals(Table.WEB_SITE)) {
-            if (session.getChunkNumber() - 1 == 0) {
-                return new Results(table, 1, total, session);
-            }
-            // Don't generate anything on other partition (start > end)
-            else {
-                return new Results(table, 2, 1, session);
-            }
-        }
-
-        // Special case: For very small tables, if the rowSetSize ends up being 1, this will cause an issue in the
-        // parallelism, so we'll just let the first thread do all the work
-        if (rowSetSize == 1) {
-            if (session.getChunkNumber() - 1 == 0) {
-                return new Results(table, 1, total, session);
-            }
-            // Don't generate anything on other partition (start > end)
-            else {
-                return new Results(table, 2, 1, session);
-            }
-        }
-
-        // Start and end calculated for each partition
-        long startRow = (session.getChunkNumber() - 1) * rowSetSize + 1;
-        long rowCount = startRow + rowSetSize - 1;
-
-        // Any extra rows (not evenly divided) will be done by the last partition
-        if (session.getChunkNumber() == parallelism) {
-            rowCount += total % parallelism;
-        }
-
-        return new Results(table, startRow, rowCount, session);
-    }
-
-    /**
      * Gets the function identifier
      *
      * @return function identifier