Posted to commits@carbondata.apache.org by ra...@apache.org on 2018/03/03 12:43:48 UTC

[01/25] carbondata git commit: [HOTFIX] Fix documentation error

Repository: carbondata
Updated Branches:
  refs/heads/branch-1.3 5b44e8105 -> ba5a70adb


[HOTFIX] Fix documentation error

Fix documentation error

This closes #1931


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/b58de09b
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/b58de09b
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/b58de09b

Branch: refs/heads/branch-1.3
Commit: b58de09b743de2ac9f2862744653eca61aebb3cf
Parents: 5b44e81
Author: Raghunandan S <ca...@gmail.com>
Authored: Mon Feb 5 14:28:58 2018 +0530
Committer: ravipesala <ra...@gmail.com>
Committed: Sat Mar 3 17:39:21 2018 +0530

----------------------------------------------------------------------
 docs/configuration-parameters.md      |  2 +-
 docs/data-management-on-carbondata.md | 11 -----------
 2 files changed, 1 insertion(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/b58de09b/docs/configuration-parameters.md
----------------------------------------------------------------------
diff --git a/docs/configuration-parameters.md b/docs/configuration-parameters.md
index 91f6cf5..7221420 100644
--- a/docs/configuration-parameters.md
+++ b/docs/configuration-parameters.md
@@ -74,7 +74,7 @@ This section provides the details of all the configurations required for CarbonD
 | carbon.horizontal.UPDATE.compaction.threshold | 1 | This property specifies the threshold limit on number of UPDATE delta files within a segment. In case the number of delta files goes beyond the threshold, the UPDATE delta files within the segment becomes eligible for horizontal compaction and compacted into single UPDATE delta file. | Values between 1 to 10000. |
 | carbon.horizontal.DELETE.compaction.threshold | 1 | This property specifies the threshold limit on number of DELETE delta files within a block of a segment. In case the number of delta files goes beyond the threshold, the DELETE delta files for the particular block of the segment becomes eligible for horizontal compaction and compacted into single DELETE delta file. | Values between 1 to 10000. |
 | carbon.update.segment.parallelism | 1 | This property specifies the parallelism for each segment during update. If there are segments that contain too many records to update and the spark job encounter data-spill related errors, it is better to increase this property value. It is recommended to set this value to a multiple of the number of executors for balance. | Values between 1 to 1000. |
-| carbon.merge.index.in.segment | true | This property is used to merge all CarbonData index files (.carbonindex) inside a segment to a sinle CarbonData index merge file (.carbonindexmerge).| Values true or false |  
+  
 
 * **Query Configuration**
   

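The table above documents the horizontal compaction thresholds and the update parallelism. As a rough sketch outside this commit, the same properties can be set programmatically through CarbonProperties before a load or update; the values here are purely illustrative:

```
// Illustrative values only; the property keys are the ones documented above.
import org.apache.carbondata.core.util.CarbonProperties

CarbonProperties.getInstance()
  .addProperty("carbon.horizontal.UPDATE.compaction.threshold", "3")
  .addProperty("carbon.horizontal.DELETE.compaction.threshold", "3")
  .addProperty("carbon.update.segment.parallelism", "4")
```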
http://git-wip-us.apache.org/repos/asf/carbondata/blob/b58de09b/docs/data-management-on-carbondata.md
----------------------------------------------------------------------
diff --git a/docs/data-management-on-carbondata.md b/docs/data-management-on-carbondata.md
index 9bb6c20..18ad5b8 100644
--- a/docs/data-management-on-carbondata.md
+++ b/docs/data-management-on-carbondata.md
@@ -265,17 +265,6 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
      ```
      ALTER TABLE test_db.carbon CHANGE a1 a1 DECIMAL(18,2)
      ```
-- **MERGE INDEX**
-   
-     This command is used to merge all the CarbonData index files (.carbonindex) inside a segment to a single CarbonData index merge file (.carbonindexmerge). This enhances the first query performance.
-     ```
-      ALTER TABLE [db_name.]table_name COMPACT 'SEGMENT_INDEX'
-      ```
-      
-      Examples:
-      ```
-      ALTER TABLE test_db.carbon COMPACT 'SEGMENT_INDEX'
-      ```
 
 ### DROP TABLE
   


[22/25] carbondata git commit: [HOTFIX] Fixed all examples

Posted by ra...@apache.org.
[HOTFIX] Fixed all examples

Fixed all examples

This closes #2024


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/5b0b503f
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/5b0b503f
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/5b0b503f

Branch: refs/heads/branch-1.3
Commit: 5b0b503f7338fa628c5ebbb7d74a393e98881a1f
Parents: f1a73bd
Author: ravipesala <ra...@gmail.com>
Authored: Fri Mar 2 21:46:50 2018 +0530
Committer: ravipesala <ra...@gmail.com>
Committed: Sat Mar 3 18:05:56 2018 +0530

----------------------------------------------------------------------
 .../apache/carbondata/examples/CarbonDataFrameExample.scala  | 8 ++++----
 .../carbondata/examples/CarbonSortColumnsExample.scala       | 6 ++----
 .../spark/testsuite/dataload/TestLoadDataGeneral.scala       | 1 +
 .../sql/execution/strategy/CarbonLateDecodeStrategy.scala    | 5 ++++-
 .../org/apache/spark/sql/parser/CarbonSpark2SqlParser.scala  | 2 +-
 5 files changed, 12 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/5b0b503f/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonDataFrameExample.scala
----------------------------------------------------------------------
diff --git a/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonDataFrameExample.scala b/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonDataFrameExample.scala
index fe15659..c8f8023 100644
--- a/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonDataFrameExample.scala
+++ b/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonDataFrameExample.scala
@@ -54,13 +54,13 @@ object CarbonDataFrameExample {
     // Saves dataframe to carbondata file
     df.write
       .format("carbondata")
-      .option("tableName", "carbon_table")
+      .option("tableName", "carbon_df_table")
       .option("compress", "true")
       .option("tempCSV", "false")
       .mode(SaveMode.Overwrite)
       .save()
 
-    spark.sql(""" SELECT * FROM carbon_table """).show()
+    spark.sql(""" SELECT * FROM carbon_df_table """).show()
 
     // Specify schema
     import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType}
@@ -74,7 +74,7 @@ object CarbonDataFrameExample {
       .format("carbondata")
       .schema(customSchema)
       // .option("dbname", "db_name") the system will use "default" as dbname if not set this option
-      .option("tableName", "carbon_table")
+      .option("tableName", "carbon_df_table")
       .load()
 
     // Dataframe operations
@@ -82,7 +82,7 @@ object CarbonDataFrameExample {
     carbondf.select($"c1", $"number" + 10).show()
     carbondf.filter($"number" > 31).show()
 
-    spark.sql("DROP TABLE IF EXISTS carbon_table")
+    spark.sql("DROP TABLE IF EXISTS carbon_df_table")
 
     spark.stop()
   }
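Putting the renamed pieces of this example together, a self-contained round trip looks roughly like the sketch below; it assumes a CarbonSession-enabled `spark` (as created by ExampleUtils in the examples module) and is not part of the commit itself:

```
// Sketch assembled from the diff above; `spark` is assumed to be a
// CarbonSession-enabled SparkSession.
import org.apache.spark.sql.SaveMode

val df = spark.range(10).toDF("number")

df.write
  .format("carbondata")
  .option("tableName", "carbon_df_table")
  .option("compress", "true")
  .option("tempCSV", "false")
  .mode(SaveMode.Overwrite)
  .save()

// Read back either through SQL or through the data source API.
spark.sql("SELECT * FROM carbon_df_table").show()
spark.read.format("carbondata").option("tableName", "carbon_df_table").load().show()
```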

http://git-wip-us.apache.org/repos/asf/carbondata/blob/5b0b503f/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonSortColumnsExample.scala
----------------------------------------------------------------------
diff --git a/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonSortColumnsExample.scala b/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonSortColumnsExample.scala
index 3a9f26b..8d0eabf 100644
--- a/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonSortColumnsExample.scala
+++ b/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonSortColumnsExample.scala
@@ -99,15 +99,13 @@ object CarbonSortColumnsExample {
       s"""
          | LOAD DATA LOCAL INPATH '$path'
          | INTO TABLE no_sort_columns_table
-         | OPTIONS('FILEHEADER'='shortField,intField,bigintField,doubleField,stringField,timestampField,decimalField,dateField,charField,floatField,complexData',
-         | 'COMPLEX_DELIMITER_LEVEL_1'='#')
+         | OPTIONS('COMPLEX_DELIMITER_LEVEL_1'='#')
        """.stripMargin)
     spark.sql(
       s"""
          | LOAD DATA LOCAL INPATH '$path'
          | INTO TABLE sort_columns_table
-         | OPTIONS('FILEHEADER'='shortField,intField,bigintField,doubleField,stringField,timestampField,decimalField,dateField,charField,floatField,complexData',
-         | 'COMPLEX_DELIMITER_LEVEL_1'='#')
+         | OPTIONS('COMPLEX_DELIMITER_LEVEL_1'='#')
        """.stripMargin)
     // scalastyle:on
 

http://git-wip-us.apache.org/repos/asf/carbondata/blob/5b0b503f/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/dataload/TestLoadDataGeneral.scala
----------------------------------------------------------------------
diff --git a/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/dataload/TestLoadDataGeneral.scala b/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/dataload/TestLoadDataGeneral.scala
index 09ca9e5..ec4e143 100644
--- a/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/dataload/TestLoadDataGeneral.scala
+++ b/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/dataload/TestLoadDataGeneral.scala
@@ -177,6 +177,7 @@ class TestLoadDataGeneral extends QueryTest with BeforeAndAfterEach {
     intercept[Exception] {
       sql("insert into load32000chardata_dup select dim1,concat(load32000chardata.dim2,'aaaa'),mes1 from load32000chardata").show()
     }
+    sql(s"LOAD DATA LOCAL INPATH '$testdata' into table load32000chardata_dup OPTIONS('FILEHEADER'='dim1,dim2,mes1')")
     intercept[Exception] {
       sql("update load32000chardata_dup set(load32000chardata_dup.dim2)=(select concat(load32000chardata.dim2,'aaaa') from load32000chardata)").show()
     }

http://git-wip-us.apache.org/repos/asf/carbondata/blob/5b0b503f/integration/spark2/src/main/scala/org/apache/spark/sql/execution/strategy/CarbonLateDecodeStrategy.scala
----------------------------------------------------------------------
diff --git a/integration/spark2/src/main/scala/org/apache/spark/sql/execution/strategy/CarbonLateDecodeStrategy.scala b/integration/spark2/src/main/scala/org/apache/spark/sql/execution/strategy/CarbonLateDecodeStrategy.scala
index 48679b1..668c4cc 100644
--- a/integration/spark2/src/main/scala/org/apache/spark/sql/execution/strategy/CarbonLateDecodeStrategy.scala
+++ b/integration/spark2/src/main/scala/org/apache/spark/sql/execution/strategy/CarbonLateDecodeStrategy.scala
@@ -145,7 +145,10 @@ private[sql] class CarbonLateDecodeStrategy extends SparkStrategy {
       filterPredicates: Seq[Expression],
       scanBuilder: (Seq[Attribute], Array[Filter],
         ArrayBuffer[AttributeReference], Seq[PartitionSpec]) => RDD[InternalRow]) = {
-    val names = relation.catalogTable.get.partitionColumnNames
+    val names = relation.catalogTable match {
+      case Some(table) => table.partitionColumnNames
+      case _ => Seq.empty
+    }
     // Get the current partitions from table.
     var partitions: Seq[PartitionSpec] = null
     if (names.nonEmpty) {

http://git-wip-us.apache.org/repos/asf/carbondata/blob/5b0b503f/integration/spark2/src/main/scala/org/apache/spark/sql/parser/CarbonSpark2SqlParser.scala
----------------------------------------------------------------------
diff --git a/integration/spark2/src/main/scala/org/apache/spark/sql/parser/CarbonSpark2SqlParser.scala b/integration/spark2/src/main/scala/org/apache/spark/sql/parser/CarbonSpark2SqlParser.scala
index 7addd26..86790ba 100644
--- a/integration/spark2/src/main/scala/org/apache/spark/sql/parser/CarbonSpark2SqlParser.scala
+++ b/integration/spark2/src/main/scala/org/apache/spark/sql/parser/CarbonSpark2SqlParser.scala
@@ -227,7 +227,7 @@ class CarbonSpark2SqlParser extends CarbonDDLSqlParser {
             }
 
           } else {
-            (sel, updateRelation(tab._1, tab._2, tab._4, Some(tab._3.get)))
+            (sel, updateRelation(tab._1, tab._2, tab._4, tab._3))
           }
         val rel = tab._3 match {
           case Some(a) => UpdateTable(relation, columns, selectStmt, Some(tab._3.get), where)


[05/25] carbondata git commit: [HOTFIX] Fix documentation errors

Posted by ra...@apache.org.
[HOTFIX] Fix documentation errors

Fix documentation errors

This closes #1955


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/6c25d240
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/6c25d240
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/6c25d240

Branch: refs/heads/branch-1.3
Commit: 6c25d24068263b6d7e669cedde22890ff4a2d463
Parents: 4033f4c
Author: Raghunandan S <ca...@gmail.com>
Authored: Thu Feb 8 21:30:03 2018 +0530
Committer: ravipesala <ra...@gmail.com>
Committed: Sat Mar 3 17:45:47 2018 +0530

----------------------------------------------------------------------
 docs/data-management-on-carbondata.md | 76 +++++++++++++-----------------
 1 file changed, 34 insertions(+), 42 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/6c25d240/docs/data-management-on-carbondata.md
----------------------------------------------------------------------
diff --git a/docs/data-management-on-carbondata.md b/docs/data-management-on-carbondata.md
index 61bb356..f70e0b7 100644
--- a/docs/data-management-on-carbondata.md
+++ b/docs/data-management-on-carbondata.md
@@ -39,7 +39,7 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
   STORED BY 'carbondata'
   [TBLPROPERTIES (property_name=property_value, ...)]
   [LOCATION 'path']
-  ```  
+  ```
   
 ### Usage Guidelines
 
@@ -101,11 +101,11 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
      These properties are table level compaction configurations, if not specified, system level configurations in carbon.properties will be used.
      Following are 5 configurations:
      
-     * MAJOR_COMPACTION_SIZE: same meaning with carbon.major.compaction.size, size in MB.
-     * AUTO_LOAD_MERGE: same meaning with carbon.enable.auto.load.merge.
-     * COMPACTION_LEVEL_THRESHOLD: same meaning with carbon.compaction.level.threshold.
-     * COMPACTION_PRESERVE_SEGMENTS: same meaning with carbon.numberof.preserve.segments.
-     * ALLOWED_COMPACTION_DAYS: same meaning with carbon.allowed.compaction.days.     
+     * MAJOR_COMPACTION_SIZE: same meaning as carbon.major.compaction.size, size in MB.
+     * AUTO_LOAD_MERGE: same meaning as carbon.enable.auto.load.merge.
+     * COMPACTION_LEVEL_THRESHOLD: same meaning as carbon.compaction.level.threshold.
+     * COMPACTION_PRESERVE_SEGMENTS: same meaning as carbon.numberof.preserve.segments.
+     * ALLOWED_COMPACTION_DAYS: same meaning as carbon.allowed.compaction.days.     
 
      ```
      TBLPROPERTIES ('MAJOR_COMPACTION_SIZE'='2048',
@@ -136,17 +136,8 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
                                    saleQuantity Int,
                                    revenue Int)
     STORED BY 'carbondata'
-    TBLPROPERTIES ('DICTIONARY_INCLUDE'='productNumber',
-                   'NO_INVERTED_INDEX'='productBatch',
-                   'SORT_COLUMNS'='productName,storeCity',
-                   'SORT_SCOPE'='NO_SORT',
-                   'TABLE_BLOCKSIZE'='512',
-                   'MAJOR_COMPACTION_SIZE'='2048',
-                   'AUTO_LOAD_MERGE'='true',
-                   'COMPACTION_LEVEL_THRESHOLD'='5,6',
-                   'COMPACTION_PRESERVE_SEGMENTS'='10',
-				   'streaming'='true',
-                   'ALLOWED_COMPACTION_DAYS'='5')
+    TBLPROPERTIES ('SORT_COLUMNS'='productName,storeCity',
+                   'SORT_SCOPE'='NO_SORT')
    ```
 
 ## CREATE DATABASE 
@@ -200,9 +191,9 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
 
      Examples:
      ```
-     ALTER TABLE carbon RENAME TO carbondata
+     ALTER TABLE carbon RENAME TO carbonTable
      OR
-     ALTER TABLE test_db.carbon RENAME TO test_db.carbondata
+     ALTER TABLE test_db.carbon RENAME TO test_db.carbonTable
      ```
 
    - **ADD COLUMNS**
@@ -294,7 +285,7 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
   * Before executing this command the old table schema and data should be copied into the new database location.
   * If the table is aggregate table, then all the aggregate tables should be copied to the new database location.
   * For old store, the time zone of the source and destination cluster should be same.
-  * If old cluster uses HIVE meta store, refresh will not work as schema file does not exist in file system.
+  * If old cluster used HIVE meta store to store schema, refresh will not work as schema file does not exist in file system.
   
 
 ## LOAD DATA
@@ -302,7 +293,7 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
 ### LOAD FILES TO CARBONDATA TABLE
   
   This command is used to load csv files to carbondata, OPTIONS are not mandatory for data loading process. 
-  Inside OPTIONS user can provide either of any options like DELIMITER, QUOTECHAR, FILEHEADER, ESCAPECHAR, MULTILINE as per requirement.
+  Inside OPTIONS user can provide any options like DELIMITER, QUOTECHAR, FILEHEADER, ESCAPECHAR, MULTILINE as per requirement.
   
   ```
   LOAD DATA [LOCAL] INPATH 'folder_path' 
@@ -352,7 +343,7 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
     OPTIONS('MULTILINE'='true') 
     ```
 
-  - **ESCAPECHAR:** Escape char can be provided if user want strict validation of escape character on CSV.
+  - **ESCAPECHAR:** Escape char can be provided if user want strict validation of escape character in CSV files.
 
     ```
     OPTIONS('ESCAPECHAR'='\') 
@@ -435,10 +426,10 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
   * BAD_RECORDS_ACTION property can have four type of actions for bad records FORCE, REDIRECT, IGNORE and FAIL.
   * FAIL option is its Default value. If the FAIL option is used, then data loading fails if any bad records are found.
   * If the REDIRECT option is used, CarbonData will add all bad records in to a separate CSV file. However, this file must not be used for subsequent data loading because the content may not exactly match the source record. You are advised to cleanse the original source record for further data ingestion. This option is used to remind you which records are bad records.
-  * If the FORCE option is used, then it auto-corrects the data by storing the bad records as NULL before Loading data.
+  * If the FORCE option is used, then it auto-converts the data by storing the bad records as NULL before Loading data.
   * If the IGNORE option is used, then bad records are neither loaded nor written to the separate CSV file.
   * In loaded data, if all records are bad records, the BAD_RECORDS_ACTION is invalid and the load operation fails.
-  * The maximum number of characters per column is 100000. If there are more than 100000 characters in a column, data loading will fail.
+  * The maximum number of characters per column is 32000. If there are more than 32000 characters in a column, data loading will fail.
 
   Example:
 
@@ -563,7 +554,6 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
 ## COMPACTION
 
   Compaction improves the query performance significantly. 
-  During the load data, several CarbonData files are generated, this is because data is sorted only within each load (per load segment and one B+ tree index).
   
   There are two types of compaction, Minor and Major compaction.
   
@@ -587,6 +577,8 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
   
   In Major compaction, multiple segments can be merged into one large segment. 
   User will specify the compaction size until which segments can be merged, Major compaction is usually done during the off-peak time.
+  Configure the property carbon.major.compaction.size with appropriate value in MB.
+  
   This command merges the specified number of segments into one segment: 
      
   ```
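The hunk above points readers at carbon.major.compaction.size. As an illustrative sketch (not part of the change), the property is given a value in MB and a major compaction is then triggered with the ALTER TABLE command this document describes; `sales` is a hypothetical table name and a CarbonSession `spark` is assumed:

```
// Sketch only: set the documented threshold (in MB), then trigger
// major compaction on a hypothetical table.
import org.apache.carbondata.core.util.CarbonProperties

CarbonProperties.getInstance()
  .addProperty("carbon.major.compaction.size", "2048")
spark.sql("ALTER TABLE sales COMPACT 'MAJOR'")
```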
@@ -963,8 +955,8 @@ roll-up for the queries on these hierarchies.
   USING "timeseries"
   DMPROPERTIES (
   'event_time’=’order_time’,
-  'year_granualrity’=’1’,
-  ) AS
+  'year_granularity’=’1’)
+  AS
   SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
    avg(price) FROM sales GROUP BY order_time, country, sex
     
@@ -973,8 +965,8 @@ roll-up for the queries on these hierarchies.
   USING "timeseries"
   DMPROPERTIES (
   'event_time’=’order_time’,
-  'month_granualrity’=’1’,
-  ) AS
+  'month_granularity’=’1’)
+  AS
   SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
    avg(price) FROM sales GROUP BY order_time, country, sex
     
@@ -983,8 +975,8 @@ roll-up for the queries on these hierarchies.
   USING "timeseries"
   DMPROPERTIES (
   'event_time’=’order_time’,
-  'day_granualrity’=’1’,
-  ) AS
+  'day_granularity’=’1’)
+  AS
   SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
    avg(price) FROM sales GROUP BY order_time, country, sex
         
@@ -993,8 +985,8 @@ roll-up for the queries on these hierarchies.
   USING "timeseries"
   DMPROPERTIES (
   'event_time’=’order_time’,
-  'hour_granualrity’=’1’,
-  ) AS
+  'hour_granularity’=’1’)
+  AS
   SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
    avg(price) FROM sales GROUP BY order_time, country, sex
   
@@ -1003,8 +995,8 @@ roll-up for the queries on these hierarchies.
   USING "timeseries"
   DMPROPERTIES (
   'event_time’=’order_time’,
-  'minute_granualrity’=’1’,
-  ) AS
+  'minute_granularity’=’1’)
+  AS
   SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
    avg(price) FROM sales GROUP BY order_time, country, sex
   ```
@@ -1030,8 +1022,8 @@ roll-up for the queries on these hierarchies.
     USING "timeseries"
     DMPROPERTIES (
     'event_time’=’order_time’,
-    'day_granualrity’=’1’,
-    ) AS
+    'day_granularity’=’1’)
+    AS
     SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
      avg(price) FROM sales GROUP BY order_time, country, sex
           
@@ -1040,8 +1032,8 @@ roll-up for the queries on these hierarchies.
     USING "timeseries"
     DMPROPERTIES (
     'event_time’=’order_time’,
-    'hour_granualrity’=’1’,
-    ) AS
+    'hour_granularity’=’1’)
+    AS
     SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
      avg(price) FROM sales GROUP BY order_time, country, sex
   ```
@@ -1078,8 +1070,8 @@ roll-up for the queries on these hierarchies.
   ```
 
   NOTE:
-  * Bucketing can not be performed for columns of Complex Data Types.
-  * Columns in the BUCKETCOLUMN parameter must be only dimension. The BUCKETCOLUMN parameter can not be a measure or a combination of measures and dimensions.
+  * Bucketing cannot be performed for columns of Complex Data Types.
+  * Columns in the BUCKETCOLUMN parameter must be dimensions. The BUCKETCOLUMN parameter cannot be a measure or a combination of measures and dimensions.
 
   Example:
   ```
@@ -1100,7 +1092,7 @@ roll-up for the queries on these hierarchies.
 
 ### SHOW SEGMENT
 
-  This command is used to get the segments of CarbonData table.
+  This command is used to list the segments of CarbonData table.
 
   ```
   SHOW SEGMENTS FOR TABLE [db_name.]table_name LIMIT number_of_segments


[17/25] carbondata git commit: [CARBONDATA-2212] Event fired while updating the status during streaming handoff

Posted by ra...@apache.org.
[CARBONDATA-2212] Event fired while updating the status during streaming handoff

This closes #2009


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/cf2390a8
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/cf2390a8
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/cf2390a8

Branch: refs/heads/branch-1.3
Commit: cf2390a8c6dfa95242dacc55ca9f7a9b2020a964
Parents: bbe7376
Author: rahulforallp <ra...@knoldus.in>
Authored: Tue Feb 27 22:00:40 2018 +0530
Committer: ravipesala <ra...@gmail.com>
Committed: Sat Mar 3 18:04:11 2018 +0530

----------------------------------------------------------------------
 .../apache/carbondata/streaming/StreamHandoffRDD.scala | 13 +++++++++++++
 1 file changed, 13 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/cf2390a8/streaming/src/main/scala/org/apache/carbondata/streaming/StreamHandoffRDD.scala
----------------------------------------------------------------------
diff --git a/streaming/src/main/scala/org/apache/carbondata/streaming/StreamHandoffRDD.scala b/streaming/src/main/scala/org/apache/carbondata/streaming/StreamHandoffRDD.scala
index 35a3513..b03ee1e 100644
--- a/streaming/src/main/scala/org/apache/carbondata/streaming/StreamHandoffRDD.scala
+++ b/streaming/src/main/scala/org/apache/carbondata/streaming/StreamHandoffRDD.scala
@@ -39,9 +39,11 @@ import org.apache.carbondata.core.scan.result.iterator.RawResultIterator
 import org.apache.carbondata.core.statusmanager.{LoadMetadataDetails, SegmentStatus, SegmentStatusManager}
 import org.apache.carbondata.core.util.CarbonUtil
 import org.apache.carbondata.core.util.path.CarbonStorePath
+import org.apache.carbondata.events.{OperationContext, OperationListenerBus}
 import org.apache.carbondata.hadoop.{CarbonInputSplit, CarbonProjection}
 import org.apache.carbondata.hadoop.api.CarbonTableInputFormat
 import org.apache.carbondata.hadoop.streaming.{CarbonStreamInputFormat, CarbonStreamRecordReader}
+import org.apache.carbondata.processing.loading.events.LoadEvents.{LoadTablePostStatusUpdateEvent, LoadTablePreStatusUpdateEvent}
 import org.apache.carbondata.processing.loading.model.CarbonLoadModel
 import org.apache.carbondata.processing.merger.{CompactionResultSortProcessor, CompactionType}
 import org.apache.carbondata.processing.util.CarbonLoaderUtil
@@ -307,7 +309,18 @@ object StreamHandoffRDD {
         SegmentStatus.INSERT_IN_PROGRESS,
         carbonLoadModel.getFactTimeStamp,
         false)
+      val operationContext = new OperationContext()
+      val loadTablePreStatusUpdateEvent: LoadTablePreStatusUpdateEvent =
+        new LoadTablePreStatusUpdateEvent(
+          carbonLoadModel.getCarbonDataLoadSchema.getCarbonTable.getCarbonTableIdentifier,
+          carbonLoadModel)
+      OperationListenerBus.getInstance().fireEvent(loadTablePreStatusUpdateEvent, operationContext)
+
       CarbonLoaderUtil.recordNewLoadMetadata(newMetaEntry, carbonLoadModel, true, false)
+      val loadTablePostStatusUpdateEvent: LoadTablePostStatusUpdateEvent =
+        new LoadTablePostStatusUpdateEvent(carbonLoadModel)
+      OperationListenerBus.getInstance()
+        .fireEvent(loadTablePostStatusUpdateEvent, operationContext)
       // convert a streaming segment to columnar segment
       val status = new StreamHandoffRDD(
         sparkSession.sparkContext,


[06/25] carbondata git commit: [Documentation] Editorial review

Posted by ra...@apache.org.
[Documentation] Editorial review

Correct some descriptions in the docs

This closes #1992


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/e5d9802a
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/e5d9802a
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/e5d9802a

Branch: refs/heads/branch-1.3
Commit: e5d9802abe244e24a64fc883690632732d94f306
Parents: 6c25d24
Author: sgururajshetty <sg...@gmail.com>
Authored: Fri Feb 23 17:05:17 2018 +0530
Committer: ravipesala <ra...@gmail.com>
Committed: Sat Mar 3 17:46:26 2018 +0530

----------------------------------------------------------------------
 docs/data-management-on-carbondata.md | 36 +++++++++++++++---------------
 docs/faq.md                           |  4 ++--
 docs/troubleshooting.md               |  4 ++--
 docs/useful-tips-on-carbondata.md     |  2 +-
 4 files changed, 23 insertions(+), 23 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/e5d9802a/docs/data-management-on-carbondata.md
----------------------------------------------------------------------
diff --git a/docs/data-management-on-carbondata.md b/docs/data-management-on-carbondata.md
index f70e0b7..78ab010 100644
--- a/docs/data-management-on-carbondata.md
+++ b/docs/data-management-on-carbondata.md
@@ -178,7 +178,7 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
   SHOW TABLES IN defaultdb
   ```
 
-### ALTER TALBE
+### ALTER TABLE
 
   The following section introduce the commands to modify the physical or logical state of the existing table(s).
 
@@ -494,7 +494,7 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
   [ WHERE { <filter_condition> } ]
   ```
   
-  alternatively the following the command can also be used for updating the CarbonData Table :
+  alternatively the following command can also be used for updating the CarbonData Table :
   
   ```
   UPDATE <table_name>
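A concrete use of this UPDATE form, with hypothetical table and column names and an assumed CarbonSession `spark`, looks roughly like:

```
// Hypothetical names throughout; shows the UPDATE form referenced above.
spark.sql(
  """UPDATE carbon_table
    |SET (city, salary) = ('shenzhen', 9000)
    |WHERE id = 5""".stripMargin)
```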
@@ -674,7 +674,7 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
 
 #### Insert OVERWRITE
   
-  This command allows you to insert or load overwrite on a spcific partition.
+  This command allows you to insert or load overwrite on a specific partition.
   
   ```
    INSERT OVERWRITE TABLE table_name
@@ -898,50 +898,50 @@ will fetch the data from the main table **sales**
 For existing table with loaded data, data load to pre-aggregate table will be triggered by the 
 CREATE DATAMAP statement when user creates the pre-aggregate table.
 For incremental loads after aggregates tables are created, loading data to main table triggers 
-the load to pre-aggregate tables once main table loading is complete.These loads are automic 
+the load to pre-aggregate tables once main table loading is complete. These loads are automic 
 meaning that data on main table and aggregate tables are only visible to the user after all tables 
 are loaded
 
 ##### Querying data from pre-aggregate tables
-Pre-aggregate tables cannot be queries directly.Queries are to be made on main table.Internally 
-carbondata will check associated pre-aggregate tables with the main table and if the 
+Pre-aggregate tables cannot be queries directly. Queries are to be made on main table. Internally 
+carbondata will check associated pre-aggregate tables with the main table, and if the 
 pre-aggregate tables satisfy the query condition, the plan is transformed automatically to use 
-pre-aggregate table to fetch the data
+pre-aggregate table to fetch the data.
 
 ##### Compacting pre-aggregate tables
 Compaction command (ALTER TABLE COMPACT) need to be run separately on each pre-aggregate table.
 Running Compaction command on main table will **not automatically** compact the pre-aggregate 
 tables.Compaction is an optional operation for pre-aggregate table. If compaction is performed on
 main table but not performed on pre-aggregate table, all queries still can benefit from 
-pre-aggregate tables.To further improve performance on pre-aggregate tables, compaction can be 
+pre-aggregate tables. To further improve performance on pre-aggregate tables, compaction can be 
 triggered on pre-aggregate tables directly, it will merge the segments inside pre-aggregate table. 
 
 ##### Update/Delete Operations on pre-aggregate tables
 This functionality is not supported.
 
   NOTE (<b>RESTRICTION</b>):
-  * Update/Delete operations are <b>not supported</b> on main table which has pre-aggregate tables 
-  created on it.All the pre-aggregate tables <b>will have to be dropped</b> before update/delete 
-  operations can be performed on the main table.Pre-aggregate tables can be rebuilt manually 
+  Update/Delete operations are <b>not supported</b> on main table which has pre-aggregate tables 
+  created on it. All the pre-aggregate tables <b>will have to be dropped</b> before update/delete 
+  operations can be performed on the main table. Pre-aggregate tables can be rebuilt manually 
   after update/delete operations are completed
  
 ##### Delete Segment Operations on pre-aggregate tables
 This functionality is not supported.
 
   NOTE (<b>RESTRICTION</b>):
-  * Delete Segment operations are <b>not supported</b> on main table which has pre-aggregate tables 
-  created on it.All the pre-aggregate tables <b>will have to be dropped</b> before update/delete 
-  operations can be performed on the main table.Pre-aggregate tables can be rebuilt manually 
+  Delete Segment operations are <b>not supported</b> on main table which has pre-aggregate tables 
+  created on it. All the pre-aggregate tables <b>will have to be dropped</b> before update/delete 
+  operations can be performed on the main table. Pre-aggregate tables can be rebuilt manually 
   after delete segment operations are completed
   
 ##### Alter Table Operations on pre-aggregate tables
 This functionality is not supported.
 
   NOTE (<b>RESTRICTION</b>):
-  * Adding new column in new table does not have any affect on pre-aggregate tables. However if 
+  Adding new column in new table does not have any affect on pre-aggregate tables. However if 
   dropping or renaming a column has impact in pre-aggregate table, such operations will be 
-  rejected and error will be thrown.All the pre-aggregate tables <b>will have to be dropped</b> 
-  before Alter Operations can be performed on the main table.Pre-aggregate tables can be rebuilt 
+  rejected and error will be thrown. All the pre-aggregate tables <b>will have to be dropped</b> 
+  before Alter Operations can be performed on the main table. Pre-aggregate tables can be rebuilt 
   manually after Alter Table operations are completed
   
 ### Supporting timeseries data (Alpha feature in 1.3.0)
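Because the rewrite to a pre-aggregate table happens during query planning, the EXPLAIN command (as the pre-aggregate DataMap guide later in this batch notes) can be used to check whether a query was transformed. A sketch with illustrative table and column names, assuming a CarbonSession `spark`:

```
// Inspect the transformed plan to see whether a pre-aggregate table
// was selected for this aggregate query (names are illustrative).
spark.sql(
  "EXPLAIN SELECT country, sex, sum(quantity) FROM sales GROUP BY country, sex"
).show(truncate = false)
```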
@@ -1012,7 +1012,7 @@ roll-up for the queries on these hierarchies.
   ```
   
   It is **not necessary** to create pre-aggregate tables for each granularity unless required for 
-  query.Carbondata can roll-up the data and fetch it.
+  query. Carbondata can roll-up the data and fetch it.
    
   For Example: For main table **sales** , If pre-aggregate tables were created as  
   

http://git-wip-us.apache.org/repos/asf/carbondata/blob/e5d9802a/docs/faq.md
----------------------------------------------------------------------
diff --git a/docs/faq.md b/docs/faq.md
index baa46cc..8f04e4f 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -80,7 +80,7 @@ In order to build CarbonData project it is necessary to specify the spark profil
 
 ## How Carbon will behave when execute insert operation in abnormal scenarios?
 Carbon support insert operation, you can refer to the syntax mentioned in [DML Operations on CarbonData](dml-operation-on-carbondata.md).
-First, create a soucre table in spark-sql and load data into this created table.
+First, create a source table in spark-sql and load data into this created table.
 
 ```
 CREATE TABLE source_table(
@@ -124,7 +124,7 @@ id  city    name
 
 As result shows, the second column is city in carbon table, but what inside is name, such as jack. This phenomenon is same with insert data into hive table.
 
-If you want to insert data into corresponding column in carbon table, you have to specify the column order same in insert statment. 
+If you want to insert data into corresponding column in carbon table, you have to specify the column order same in insert statement. 
 
 ```
 INSERT INTO TABLE carbon_table SELECT id, city, name FROM source_table;

http://git-wip-us.apache.org/repos/asf/carbondata/blob/e5d9802a/docs/troubleshooting.md
----------------------------------------------------------------------
diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md
index 68dd538..0156121 100644
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -177,7 +177,7 @@ Note :  Refrain from using "mvn clean package" without specifying the profile.
   Data loading fails with the following exception :
 
    ```
-   Data Load failure exeception
+   Data Load failure exception
    ```
 
   **Possible Cause**
@@ -208,7 +208,7 @@ Note :  Refrain from using "mvn clean package" without specifying the profile.
   Insertion fails with the following exception :
 
    ```
-   Data Load failure exeception
+   Data Load failure exception
    ```
 
   **Possible Cause**

http://git-wip-us.apache.org/repos/asf/carbondata/blob/e5d9802a/docs/useful-tips-on-carbondata.md
----------------------------------------------------------------------
diff --git a/docs/useful-tips-on-carbondata.md b/docs/useful-tips-on-carbondata.md
index aaf6460..4d43003 100644
--- a/docs/useful-tips-on-carbondata.md
+++ b/docs/useful-tips-on-carbondata.md
@@ -138,7 +138,7 @@
   |carbon.number.of.cores.while.loading|Default: 2.This value should be >= 2|Specifies the number of cores used for data processing during data loading in CarbonData. |
   |carbon.sort.size|Default: 100000. The value should be >= 100.|Threshold to write local file in sort step when loading data|
   |carbon.sort.file.write.buffer.size|Default:  50000.|DataOutputStream buffer. |
-  |carbon.number.of.cores.block.sort|Default: 7 | If you have huge memory and cpus, increase it as you will|
+  |carbon.number.of.cores.block.sort|Default: 7 | If you have huge memory and CPUs, increase it as you will|
   |carbon.merge.sort.reader.thread|Default: 3 |Specifies the number of cores used for temp file merging during data loading in CarbonData.|
   |carbon.merge.sort.prefetch|Default: true | You may want set this value to false if you have not enough memory|
 


[25/25] carbondata git commit: [CARBONDATA-2098] Add datamap management description

Posted by ra...@apache.org.
[CARBONDATA-2098] Add datamap management description

Enhance documentation for datamap

This closes #2026


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/ba5a70ad
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/ba5a70ad
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/ba5a70ad

Branch: refs/heads/branch-1.3
Commit: ba5a70adb85328b2cd5d7d372b51d32fd807ceb8
Parents: ec89341
Author: Jacky Li <ja...@qq.com>
Authored: Sat Mar 3 13:40:59 2018 +0800
Committer: ravipesala <ra...@gmail.com>
Committed: Sat Mar 3 18:12:01 2018 +0530

----------------------------------------------------------------------
 docs/datamap/preaggregate-datamap-guide.md      | 51 +++++++++++++++++---
 docs/datamap/timeseries-datamap-guide.md        | 23 ++++++---
 .../examples/PreAggregateTableExample.scala     |  2 +
 3 files changed, 64 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/ba5a70ad/docs/datamap/preaggregate-datamap-guide.md
----------------------------------------------------------------------
diff --git a/docs/datamap/preaggregate-datamap-guide.md b/docs/datamap/preaggregate-datamap-guide.md
index fabfd7d..199f674 100644
--- a/docs/datamap/preaggregate-datamap-guide.md
+++ b/docs/datamap/preaggregate-datamap-guide.md
@@ -1,5 +1,13 @@
 # CarbonData Pre-aggregate DataMap
   
+* [Quick Example](#quick-example)
+* [DataMap Management](#datamap-management)
+* [Pre-aggregate Table](#preaggregate-datamap-introduction)
+* [Loading Data](#loading-data)
+* [Querying Data](#querying-data)
+* [Compaction](#compacting-pre-aggregate-tables)
+* [Data Management](#data-management-with-pre-aggregate-tables)
+
 ## Quick example
 Download and unzip spark-2.2.0-bin-hadoop2.7.tgz, and export $SPARK_HOME
 
@@ -85,7 +93,35 @@ Start spark-shell in new terminal, type :paste, then copy and run the following
   spark.stop
 ```
 
-##PRE-AGGREGATE DataMap  
+#### DataMap Management
+DataMap can be created using following DDL
+  ```
+  CREATE DATAMAP [IF NOT EXISTS] datamap_name
+  ON TABLE main_table
+  USING "datamap_provider"
+  DMPROPERTIES ('key'='value', ...)
+  AS
+    SELECT statement
+  ```
+The string followed by USING is called DataMap Provider, in this version CarbonData supports two 
+kinds of DataMap: 
+1. preaggregate, for pre-aggregate table. No DMPROPERTY is required for this DataMap
+2. timeseries, for timeseries roll-up table. Please refer to [Timeseries DataMap](https://github.com/apache/carbondata/blob/master/docs/datamap/timeseries-datamap-guide.md)
+
+DataMap can be dropped using following DDL
+  ```
+  DROP DATAMAP [IF EXISTS] datamap_name
+  ON TABLE main_table
+  ```
+To show all DataMaps created, use:
+  ```
+  SHOW DATAMAP 
+  ON TABLE main_table
+  ```
+It will show all DataMaps created on main table.
+
+
+## Preaggregate DataMap Introduction
   Pre-aggregate tables are created as DataMaps and managed as tables internally by CarbonData. 
   User can create as many pre-aggregate datamaps required to improve query performance, 
   provided the storage requirements and loading speeds are acceptable.
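The DDL added above can be exercised end to end. The sketch below reuses the maintable and preagg_count names from the PreAggregateTableExample touched later in this commit; it is illustrative and assumes a CarbonSession `spark`:

```
// Lifecycle sketch for the DDL documented above; names follow
// PreAggregateTableExample from this commit.
spark.sql(
  """CREATE DATAMAP IF NOT EXISTS preagg_count
    |ON TABLE maintable
    |USING 'preaggregate'
    |AS SELECT name, count(*) FROM maintable GROUP BY name""".stripMargin)

spark.sql("SHOW DATAMAP ON TABLE maintable").show()
spark.sql("DROP DATAMAP IF EXISTS preagg_count ON TABLE maintable")
```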
@@ -163,7 +199,7 @@ SELECT country, max(price) from sales GROUP BY country
 will query against main table **sales** only, because it does not satisfy pre-aggregate table 
 selection logic. 
 
-#### Loading data to pre-aggregate tables
+## Loading data
 For existing table with loaded data, data load to pre-aggregate table will be triggered by the 
 CREATE DATAMAP statement when user creates the pre-aggregate table. For incremental loads after 
 aggregates tables are created, loading data to main table triggers the load to pre-aggregate tables 
@@ -174,7 +210,7 @@ meaning that data on main table and pre-aggregate tables are only visible to the
 tables are loaded successfully, if one of these loads fails, new data are not visible in all tables 
 as if the load operation is not happened.   
 
-#### Querying data from pre-aggregate tables
+## Querying data
 As a technique for query acceleration, Pre-aggregate tables cannot be queries directly. 
 Queries are to be made on main table. While doing query planning, internally CarbonData will check 
 associated pre-aggregate tables with the main table, and do query plan transformation accordingly. 
@@ -183,7 +219,8 @@ User can verify whether a query can leverage pre-aggregate table or not by execu
 command, which will show the transformed logical plan, and thus user can check whether pre-aggregate
 table is selected.
 
-#### Compacting pre-aggregate tables
+
+## Compacting pre-aggregate tables
 Running Compaction command (`ALTER TABLE COMPACT`) on main table will **not automatically** 
 compact the pre-aggregate tables created on the main table. User need to run Compaction command 
 separately on each pre-aggregate table to compact them.
@@ -193,8 +230,10 @@ main table but not performed on pre-aggregate table, all queries still can benef
 pre-aggregate tables. To further improve the query performance, compaction on pre-aggregate tables 
 can be triggered to merge the segments and files in the pre-aggregate tables. 
 
-#### Data Management on pre-aggregate tables
-Once there is pre-aggregate table created on the main table, following command on the main table
+## Data Management with pre-aggregate tables
+In current implementation, data consistence need to be maintained for both main table and pre-aggregate
+tables. Once there is pre-aggregate table created on the main table, following command on the main 
+table
 is not supported:
 1. Data management command: `UPDATE/DELETE/DELETE SEGMENT`. 
 2. Schema management command: `ALTER TABLE DROP COLUMN`, `ALTER TABLE CHANGE DATATYPE`, 

http://git-wip-us.apache.org/repos/asf/carbondata/blob/ba5a70ad/docs/datamap/timeseries-datamap-guide.md
----------------------------------------------------------------------
diff --git a/docs/datamap/timeseries-datamap-guide.md b/docs/datamap/timeseries-datamap-guide.md
index ecd7234..886c161 100644
--- a/docs/datamap/timeseries-datamap-guide.md
+++ b/docs/datamap/timeseries-datamap-guide.md
@@ -1,14 +1,25 @@
 # CarbonData Timeseries DataMap
 
-## Supporting timeseries data (Alpha feature in 1.3.0)
+* [Timeseries DataMap](#timeseries-datamap-intoduction-(alpha-feature-in-1.3.0))
+* [Compaction](#compacting-pre-aggregate-tables)
+* [Data Management](#data-management-with-pre-aggregate-tables)
+
+## Timeseries DataMap Intoduction (Alpha feature in 1.3.0)
 Timeseries DataMap a pre-aggregate table implementation based on 'preaggregate' DataMap. 
 Difference is that Timerseries DataMap has built-in understanding of time hierarchy and 
 levels: year, month, day, hour, minute, so that it supports automatic roll-up in time dimension 
 for query.
+
+The data loading, querying, compaction command and its behavior is the same as preaggregate DataMap.
+Please refer to [Pre-aggregate DataMap](https://github.com/apache/carbondata/blob/master/docs/datamap/preaggregate-datamap-guide.md)
+for more information.
   
-For instance, user can create multiple timeseries datamap on the main table which has a *event_time*
-column, one datamap for one time granularity. Then Carbondata can do automatic roll-up for queries 
-on the main table.
+To use this datamap, user can create multiple timeseries datamap on the main table which has 
+a *event_time* column, one datamap for one time granularity. Then Carbondata can do automatic 
+roll-up for queries on the main table.
+
+For example, below statement effectively create multiple pre-aggregate tables  on main table called 
+**timeseries**
 
 ```
 CREATE DATAMAP agg_year
@@ -126,10 +137,10 @@ the future CarbonData release.
 * timeseries datamaps created for each level needs to be dropped separately 
       
 
-#### Compacting timeseries datamp
+## Compacting timeseries datamp
 Refer to Compaction section in [preaggregation datamap](https://github.com/apache/carbondata/blob/master/docs/datamap/preaggregate-datamap-guide.md). 
 Same applies to timeseries datamap.
 
-#### Data Management on timeseries datamap
+## Data Management on timeseries datamap
 Refer to Data Management section in [preaggregation datamap](https://github.com/apache/carbondata/blob/master/docs/datamap/preaggregate-datamap-guide.md).
 Same applies to timeseries datamap.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/carbondata/blob/ba5a70ad/examples/spark2/src/main/scala/org/apache/carbondata/examples/PreAggregateTableExample.scala
----------------------------------------------------------------------
diff --git a/examples/spark2/src/main/scala/org/apache/carbondata/examples/PreAggregateTableExample.scala b/examples/spark2/src/main/scala/org/apache/carbondata/examples/PreAggregateTableExample.scala
index ace3dcc..64ed525 100644
--- a/examples/spark2/src/main/scala/org/apache/carbondata/examples/PreAggregateTableExample.scala
+++ b/examples/spark2/src/main/scala/org/apache/carbondata/examples/PreAggregateTableExample.scala
@@ -99,6 +99,8 @@ object PreAggregateTableExample {
       s"""create datamap preagg_count on table maintable using 'preaggregate' as
          | select name, count(*) from maintable group by name""".stripMargin)
 
+    spark.sql("show datamap on table maintable").show
+
     spark.sql(
       s"""
          | SELECT id,max(age)


[18/25] carbondata git commit: [CARBONDATA-2055][Streaming] Support integrating Stream table with Spark Streaming

Posted by ra...@apache.org.
[CARBONDATA-2055][Streaming] Support integrating Stream table with Spark Streaming

This closes #1867


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/6bb5a2b0
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/6bb5a2b0
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/6bb5a2b0

Branch: refs/heads/branch-1.3
Commit: 6bb5a2b0a0a8177f14d90177b44e74d38eb69feb
Parents: cf2390a
Author: Zhang Zhichao <44...@qq.com>
Authored: Sat Jan 27 00:03:19 2018 +0800
Committer: ravipesala <ra...@gmail.com>
Committed: Sat Mar 3 18:04:28 2018 +0530

----------------------------------------------------------------------
 .../CarbonBatchSparkStreamingExample.scala      |   6 +-
 .../CarbonStreamSparkStreamingExample.scala     | 218 +++++++++++++++++++
 ...CarbonStructuredStreamingWithRowParser.scala |   2 +-
 integration/spark2/pom.xml                      |   6 +
 .../spark/sql/CarbonSparkStreamingFactory.scala |  60 +++++
 .../TestStreamingTableWithRowParser.scala       |   2 +-
 streaming/pom.xml                               |   6 +
 .../streaming/parser/CarbonStreamParser.java    |   3 +
 .../CarbonSparkStreamingListener.scala          |  31 +++
 .../streaming/CarbonStreamSparkStreaming.scala  | 187 ++++++++++++++++
 10 files changed, 514 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/6bb5a2b0/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonBatchSparkStreamingExample.scala
----------------------------------------------------------------------
diff --git a/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonBatchSparkStreamingExample.scala b/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonBatchSparkStreamingExample.scala
index 6ae87b9..ef4dbce 100644
--- a/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonBatchSparkStreamingExample.scala
+++ b/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonBatchSparkStreamingExample.scala
@@ -167,15 +167,11 @@ object CarbonBatchSparkStreamingExample {
         .map(fields => DStreamData(fields(0).toInt, fields(1), fields(2), fields(3).toFloat))
 
       batchData.foreachRDD { (rdd: RDD[DStreamData], time: Time) => {
-        val df = SparkSession.builder().getOrCreate()
-          .createDataFrame(rdd).toDF("id", "name", "city", "salary")
+        val df = spark.createDataFrame(rdd).toDF("id", "name", "city", "salary")
         println("at time: " + time.toString() + " the count of received data: " + df.count())
         df.write
           .format("carbondata")
           .option("tableName", tableName)
-          .option("tempCSV", "false")
-          .option("compress", "true")
-          .option("single_pass", "true")
           .mode(SaveMode.Append)
           .save()
       }}

http://git-wip-us.apache.org/repos/asf/carbondata/blob/6bb5a2b0/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonStreamSparkStreamingExample.scala
----------------------------------------------------------------------
diff --git a/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonStreamSparkStreamingExample.scala b/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonStreamSparkStreamingExample.scala
new file mode 100644
index 0000000..f59a610
--- /dev/null
+++ b/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonStreamSparkStreamingExample.scala
@@ -0,0 +1,218 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.examples
+
+import java.io.{File, PrintWriter}
+import java.net.ServerSocket
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.CarbonEnv
+import org.apache.spark.sql.CarbonSparkStreamingFactory
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.SaveMode
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.streaming.{Seconds, StreamingContext, Time}
+
+import org.apache.carbondata.core.constants.CarbonCommonConstants
+import org.apache.carbondata.core.util.CarbonProperties
+import org.apache.carbondata.core.util.path.{CarbonStorePath, CarbonTablePath}
+import org.apache.carbondata.streaming.CarbonSparkStreamingListener
+import org.apache.carbondata.streaming.parser.CarbonStreamParser
+
+/**
+ * This example introduces how to use Spark Streaming to write data
+ * to CarbonData stream table.
+ *
+ * NOTE: Current integration with Spark Streaming is an alpha feature.
+ */
+// scalastyle:off println
+object CarbonStreamSparkStreamingExample {
+
+  def main(args: Array[String]): Unit = {
+
+    // setup paths
+    val rootPath = new File(this.getClass.getResource("/").getPath
+                            + "../../../..").getCanonicalPath
+    val checkpointPath =
+      s"$rootPath/examples/spark2/target/spark_streaming_cp_" +
+      System.currentTimeMillis().toString()
+    val streamTableName = s"dstream_stream_table"
+
+    val spark = ExampleUtils.createCarbonSession("CarbonStreamSparkStreamingExample", 4)
+
+    val requireCreateTable = true
+
+    if (requireCreateTable) {
+      // drop table if exists previously
+      spark.sql(s"DROP TABLE IF EXISTS ${ streamTableName }")
+      // Create target carbon table and populate with initial data
+      spark.sql(
+        s"""
+           | CREATE TABLE ${ streamTableName }(
+           | id INT,
+           | name STRING,
+           | city STRING,
+           | salary FLOAT
+           | )
+           | STORED BY 'carbondata'
+           | TBLPROPERTIES(
+           | 'streaming'='true',
+           | 'sort_columns'='name',
+           | 'dictionary_include'='city')
+           | """.stripMargin)
+      val carbonTable = CarbonEnv.getCarbonTable(Some("default"), streamTableName)(spark)
+      val tablePath = CarbonStorePath.getCarbonTablePath(carbonTable.getAbsoluteTableIdentifier)
+      // batch load
+      val path = s"$rootPath/examples/spark2/src/main/resources/streamSample.csv"
+      spark.sql(
+        s"""
+           | LOAD DATA LOCAL INPATH '$path'
+           | INTO TABLE $streamTableName
+           | OPTIONS('HEADER'='true')
+         """.stripMargin)
+
+      // streaming ingest
+      val serverSocket = new ServerSocket(7071)
+      val thread1 = writeSocket(serverSocket)
+      val thread2 = showTableCount(spark, streamTableName)
+      val ssc = startStreaming(spark, streamTableName, tablePath, checkpointPath)
+      // add a Spark Streaming Listener to remove all lock for stream tables when stop app
+      ssc.sparkContext.addSparkListener(new CarbonSparkStreamingListener())
+      // wait for stop signal to stop Spark Streaming App
+      waitForStopSignal(ssc)
+      // it need to start Spark Streaming App in main thread
+      // otherwise it will encounter an not-serializable exception.
+      ssc.start()
+      ssc.awaitTermination()
+      thread1.interrupt()
+      thread2.interrupt()
+      serverSocket.close()
+    }
+
+    spark.sql(s"select count(*) from ${ streamTableName }").show(100, truncate = false)
+
+    spark.sql(s"select * from ${ streamTableName } order by id desc").show(100, truncate = false)
+
+    // record(id = 100000001) comes from batch segment_0
+    // record(id = 1) comes from stream segment_1
+    spark.sql(s"select * " +
+              s"from ${ streamTableName } " +
+              s"where id = 100000001 or id = 1 limit 100").show(100, truncate = false)
+
+    // not filter
+    spark.sql(s"select * " +
+              s"from ${ streamTableName } " +
+              s"where id < 10 limit 100").show(100, truncate = false)
+
+    // show segments
+    spark.sql(s"SHOW SEGMENTS FOR TABLE ${streamTableName}").show(false)
+
+    spark.stop()
+    System.out.println("streaming finished")
+  }
+
+  def showTableCount(spark: SparkSession, tableName: String): Thread = {
+    val thread = new Thread() {
+      override def run(): Unit = {
+        for (_ <- 0 to 1000) {
+          println(System.currentTimeMillis())
+          spark.sql(s"select count(*) from $tableName").show(truncate = false)
+          spark.sql(s"SHOW SEGMENTS FOR TABLE ${tableName}").show(false)
+          Thread.sleep(1000 * 5)
+        }
+      }
+    }
+    thread.start()
+    thread
+  }
+
+  def waitForStopSignal(ssc: StreamingContext): Thread = {
+    val thread = new Thread() {
+      override def run(): Unit = {
+        // use command 'nc 127.0.0.1 7072' to stop Spark Streaming App
+        new ServerSocket(7072).accept()
+        // don't stop SparkContext here
+        ssc.stop(false, true)
+      }
+    }
+    thread.start()
+    thread
+  }
+
+  def startStreaming(spark: SparkSession, tableName: String,
+      tablePath: CarbonTablePath, checkpointPath: String): StreamingContext = {
+    var ssc: StreamingContext = null
+    try {
+      // recommendation: set a larger batch interval, such as 30s or 1min.
+      ssc = new StreamingContext(spark.sparkContext, Seconds(30))
+      ssc.checkpoint(checkpointPath)
+
+      val readSocketDF = ssc.socketTextStream("localhost", 7071)
+
+      val batchData = readSocketDF
+        .map(_.split(","))
+        .map(fields => DStreamData(fields(0).toInt, fields(1), fields(2), fields(3).toFloat))
+
+      println("init carbon table info")
+      batchData.foreachRDD { (rdd: RDD[DStreamData], time: Time) => {
+        val df = spark.createDataFrame(rdd).toDF()
+        println(System.currentTimeMillis().toString() +
+          " at batch time: " + time.toString() +
+          " the count of received data: " + df.count())
+        CarbonSparkStreamingFactory.getStreamSparkStreamingWriter(spark, "default", tableName)
+          .option(CarbonStreamParser.CARBON_STREAM_PARSER,
+            CarbonStreamParser.CARBON_STREAM_PARSER_ROW_PARSER)
+          .mode(SaveMode.Append)
+          .writeStreamData(df, time)
+      }}
+    } catch {
+      case ex: Exception =>
+        ex.printStackTrace()
+        println("Done reading and writing streaming data")
+    }
+    ssc
+  }
+
+  def writeSocket(serverSocket: ServerSocket): Thread = {
+    val thread = new Thread() {
+      override def run(): Unit = {
+        // wait for a client connection request and accept it
+        val clientSocket = serverSocket.accept()
+        val socketWriter = new PrintWriter(clientSocket.getOutputStream())
+        var index = 0
+        for (_ <- 1 to 1000) {
+          // write a batch of records (index 0 to 100) per iteration
+          for (_ <- 0 to 100) {
+            index = index + 1
+            socketWriter.println(index.toString + ",name_" + index
+                                 + ",city_" + index + "," + (index * 10000.00).toString +
+                                 ",school_" + index + ":school_" + index + index + "$" + index)
+          }
+          socketWriter.flush()
+          Thread.sleep(2000)
+        }
+        socketWriter.close()
+        System.out.println("Socket closed")
+      }
+    }
+    thread.start()
+    thread
+  }
+}
+// scalastyle:on println

http://git-wip-us.apache.org/repos/asf/carbondata/blob/6bb5a2b0/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonStructuredStreamingWithRowParser.scala
----------------------------------------------------------------------
diff --git a/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonStructuredStreamingWithRowParser.scala b/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonStructuredStreamingWithRowParser.scala
index f134a8d..cce833b 100644
--- a/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonStructuredStreamingWithRowParser.scala
+++ b/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonStructuredStreamingWithRowParser.scala
@@ -171,7 +171,7 @@ object CarbonStructuredStreamingWithRowParser {
             .option("dbName", "default")
             .option("tableName", "stream_table_with_row_parser")
             .option(CarbonStreamParser.CARBON_STREAM_PARSER,
-              "org.apache.carbondata.streaming.parser.RowStreamParserImp")
+              CarbonStreamParser.CARBON_STREAM_PARSER_ROW_PARSER)
             .start()
 
           qry.awaitTermination()

http://git-wip-us.apache.org/repos/asf/carbondata/blob/6bb5a2b0/integration/spark2/pom.xml
----------------------------------------------------------------------
diff --git a/integration/spark2/pom.xml b/integration/spark2/pom.xml
index 9ac240b..90a5891 100644
--- a/integration/spark2/pom.xml
+++ b/integration/spark2/pom.xml
@@ -48,6 +48,12 @@
       <artifactId>spark-repl_${scala.binary.version}</artifactId>
     </dependency>
     <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-streaming_${scala.binary.version}</artifactId>
+      <version>${spark.version}</version>
+      <scope>${spark.deps.scope}</scope>
+    </dependency>
+    <dependency>
       <groupId>junit</groupId>
       <artifactId>junit</artifactId>
     </dependency>

http://git-wip-us.apache.org/repos/asf/carbondata/blob/6bb5a2b0/integration/spark2/src/main/scala/org/apache/spark/sql/CarbonSparkStreamingFactory.scala
----------------------------------------------------------------------
diff --git a/integration/spark2/src/main/scala/org/apache/spark/sql/CarbonSparkStreamingFactory.scala b/integration/spark2/src/main/scala/org/apache/spark/sql/CarbonSparkStreamingFactory.scala
new file mode 100644
index 0000000..15b038b
--- /dev/null
+++ b/integration/spark2/src/main/scala/org/apache/spark/sql/CarbonSparkStreamingFactory.scala
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql
+
+import org.apache.commons.lang3.StringUtils
+
+import org.apache.carbondata.streaming.CarbonStreamException
+import org.apache.carbondata.streaming.CarbonStreamSparkStreaming
+import org.apache.carbondata.streaming.CarbonStreamSparkStreamingWriter
+
+/**
+ * Creates a [[CarbonStreamSparkStreamingWriter]] for a stream table
+ * when integrating with Spark Streaming.
+ *
+ * NOTE: Current integration with Spark Streaming is an alpha feature.
+ */
+object CarbonSparkStreamingFactory {
+
+  def getStreamSparkStreamingWriter(spark: SparkSession,
+    dbNameStr: String,
+    tableName: String): CarbonStreamSparkStreamingWriter =
+    synchronized {
+    val dbName = if (StringUtils.isEmpty(dbNameStr)) "default" else dbNameStr
+    val key = dbName + "." + tableName
+    if (CarbonStreamSparkStreaming.getTableMap.containsKey(key)) {
+      CarbonStreamSparkStreaming.getTableMap.get(key)
+    } else {
+      if (StringUtils.isEmpty(tableName) || tableName.contains(" ")) {
+        throw new CarbonStreamException("Table creation failed. " +
+                                        "Table name must not be blank or " +
+                                        "contain blank spaces")
+      }
+      val carbonTable = CarbonEnv.getCarbonTable(Some(dbName),
+        tableName)(spark)
+      if (!carbonTable.isStreamingTable) {
+        throw new CarbonStreamException(s"Table ${carbonTable.getDatabaseName}." +
+                                        s"${carbonTable.getTableName} is not a streaming table")
+      }
+      val streamWriter = new CarbonStreamSparkStreamingWriter(spark,
+        carbonTable, spark.sessionState.newHadoopConf())
+      CarbonStreamSparkStreaming.getTableMap.put(key, streamWriter)
+      streamWriter
+    }
+  }
+}
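
A minimal usage sketch of this factory from a DStream job (illustrative only, not part of the patch above). The `spark` session, the per-batch `DataFrame` and the table name `stream_table` are assumptions that mirror the example application shown earlier in this mail:

```
import org.apache.spark.sql.{CarbonSparkStreamingFactory, DataFrame, SaveMode, SparkSession}
import org.apache.spark.streaming.Time

import org.apache.carbondata.streaming.parser.CarbonStreamParser

// Hedged sketch: "default"."stream_table" is an assumed, already-created streaming table.
def writeBatch(spark: SparkSession, batch: DataFrame, time: Time): Unit = {
  // the factory caches one writer per "db.table" key; the stream table lock is taken on the first write
  CarbonSparkStreamingFactory.getStreamSparkStreamingWriter(spark, "default", "stream_table")
    .option(CarbonStreamParser.CARBON_STREAM_PARSER,
      CarbonStreamParser.CARBON_STREAM_PARSER_ROW_PARSER)
    .mode(SaveMode.Append)
    .writeStreamData(batch, time)
}
```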

http://git-wip-us.apache.org/repos/asf/carbondata/blob/6bb5a2b0/integration/spark2/src/test/scala/org/apache/spark/carbondata/TestStreamingTableWithRowParser.scala
----------------------------------------------------------------------
diff --git a/integration/spark2/src/test/scala/org/apache/spark/carbondata/TestStreamingTableWithRowParser.scala b/integration/spark2/src/test/scala/org/apache/spark/carbondata/TestStreamingTableWithRowParser.scala
index a3df2be..3e3b2c5 100644
--- a/integration/spark2/src/test/scala/org/apache/spark/carbondata/TestStreamingTableWithRowParser.scala
+++ b/integration/spark2/src/test/scala/org/apache/spark/carbondata/TestStreamingTableWithRowParser.scala
@@ -784,7 +784,7 @@ class TestStreamingTableWithRowParser extends QueryTest with BeforeAndAfterAll {
             .option("timestampformat", CarbonCommonConstants.CARBON_TIMESTAMP_DEFAULT_FORMAT)
             .option(CarbonCommonConstants.ENABLE_AUTO_HANDOFF, autoHandoff)
             .option(CarbonStreamParser.CARBON_STREAM_PARSER,
-              "org.apache.carbondata.streaming.parser.RowStreamParserImp")
+              CarbonStreamParser.CARBON_STREAM_PARSER_ROW_PARSER)
             .start()
           qry.awaitTermination()
         } catch {

http://git-wip-us.apache.org/repos/asf/carbondata/blob/6bb5a2b0/streaming/pom.xml
----------------------------------------------------------------------
diff --git a/streaming/pom.xml b/streaming/pom.xml
index 40e3d33..1d4dc7f 100644
--- a/streaming/pom.xml
+++ b/streaming/pom.xml
@@ -24,6 +24,12 @@
       <version>${project.version}</version>
     </dependency>
     <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-streaming_${scala.binary.version}</artifactId>
+      <version>${spark.version}</version>
+      <scope>${spark.deps.scope}</scope>
+    </dependency>
+    <dependency>
       <groupId>junit</groupId>
       <artifactId>junit</artifactId>
       <scope>test</scope>

http://git-wip-us.apache.org/repos/asf/carbondata/blob/6bb5a2b0/streaming/src/main/java/org/apache/carbondata/streaming/parser/CarbonStreamParser.java
----------------------------------------------------------------------
diff --git a/streaming/src/main/java/org/apache/carbondata/streaming/parser/CarbonStreamParser.java b/streaming/src/main/java/org/apache/carbondata/streaming/parser/CarbonStreamParser.java
index 643758c..e335626 100644
--- a/streaming/src/main/java/org/apache/carbondata/streaming/parser/CarbonStreamParser.java
+++ b/streaming/src/main/java/org/apache/carbondata/streaming/parser/CarbonStreamParser.java
@@ -31,6 +31,9 @@ public interface CarbonStreamParser {
   String CARBON_STREAM_PARSER_DEFAULT =
       "org.apache.carbondata.streaming.parser.CSVStreamParserImp";
 
+  String CARBON_STREAM_PARSER_ROW_PARSER =
+      "org.apache.carbondata.streaming.parser.RowStreamParserImp";
+
   void initialize(Configuration configuration, StructType structType);
 
   Object[] parserRow(InternalRow value);

http://git-wip-us.apache.org/repos/asf/carbondata/blob/6bb5a2b0/streaming/src/main/scala/org/apache/carbondata/streaming/CarbonSparkStreamingListener.scala
----------------------------------------------------------------------
diff --git a/streaming/src/main/scala/org/apache/carbondata/streaming/CarbonSparkStreamingListener.scala b/streaming/src/main/scala/org/apache/carbondata/streaming/CarbonSparkStreamingListener.scala
new file mode 100644
index 0000000..6d1fa45
--- /dev/null
+++ b/streaming/src/main/scala/org/apache/carbondata/streaming/CarbonSparkStreamingListener.scala
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.streaming
+
+import org.apache.spark.scheduler.SparkListener
+import org.apache.spark.scheduler.SparkListenerApplicationEnd
+
+class CarbonSparkStreamingListener extends SparkListener {
+
+  /**
+   * When the Spark Streaming app stops, remove all locks for stream tables.
+   */
+  override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit = {
+    CarbonStreamSparkStreaming.cleanAllLockAfterStop()
+  }
+}

http://git-wip-us.apache.org/repos/asf/carbondata/blob/6bb5a2b0/streaming/src/main/scala/org/apache/carbondata/streaming/CarbonStreamSparkStreaming.scala
----------------------------------------------------------------------
diff --git a/streaming/src/main/scala/org/apache/carbondata/streaming/CarbonStreamSparkStreaming.scala b/streaming/src/main/scala/org/apache/carbondata/streaming/CarbonStreamSparkStreaming.scala
new file mode 100644
index 0000000..4aa1517
--- /dev/null
+++ b/streaming/src/main/scala/org/apache/carbondata/streaming/CarbonStreamSparkStreaming.scala
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.streaming
+
+import java.util
+
+import scala.collection.JavaConverters._
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.execution.streaming.CarbonAppendableStreamSink
+import org.apache.spark.sql.execution.streaming.Sink
+import org.apache.spark.sql.SaveMode
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.streaming.Time
+
+import org.apache.carbondata.common.logging.LogServiceFactory
+import org.apache.carbondata.core.locks.{CarbonLockFactory, ICarbonLock, LockUsage}
+import org.apache.carbondata.core.metadata.schema.table.CarbonTable
+
+/**
+ * Writer used to write stream data to a stream table
+ * when integrating with Spark Streaming.
+ *
+ * NOTE: Current integration with Spark Streaming is an alpha feature.
+ */
+class CarbonStreamSparkStreamingWriter(val sparkSession: SparkSession,
+    val carbonTable: CarbonTable,
+    val configuration: Configuration) {
+
+  private val LOGGER = LogServiceFactory.getLogService(this.getClass.getCanonicalName)
+
+  private var isInitialize: Boolean = false
+
+  private var lock: ICarbonLock = null
+  private var carbonAppendableStreamSink: Sink = null
+
+  /**
+   * Acquire the lock for the stream table
+   */
+  def lockStreamTable(): Unit = {
+    lock = CarbonLockFactory.getCarbonLockObj(carbonTable.getAbsoluteTableIdentifier,
+      LockUsage.STREAMING_LOCK)
+    if (lock.lockWithRetries()) {
+      LOGGER.info("Acquired the lock for stream table: " +
+                  carbonTable.getDatabaseName + "." +
+                  carbonTable.getTableName)
+    } else {
+      LOGGER.error("Not able to acquire the lock for stream table:" +
+                   carbonTable.getDatabaseName + "." + carbonTable.getTableName)
+      throw new InterruptedException(
+        "Not able to acquire the lock for stream table: " + carbonTable.getDatabaseName + "." +
+        carbonTable.getTableName)
+    }
+  }
+
+  /**
+   * Release the lock for the stream table
+   */
+  def unLockStreamTable(): Unit = {
+    if (null != lock) {
+      lock.unlock()
+      LOGGER.info("unlock for stream table: " +
+                  carbonTable.getDatabaseName + "." +
+                  carbonTable.getTableName)
+    }
+  }
+
+  def initialize(): Unit = {
+    carbonAppendableStreamSink = StreamSinkFactory.createStreamTableSink(
+      sparkSession,
+      configuration,
+      carbonTable,
+      extraOptions.toMap).asInstanceOf[CarbonAppendableStreamSink]
+
+    lockStreamTable()
+
+    isInitialize = true
+  }
+
+  def writeStreamData(dataFrame: DataFrame, time: Time): Unit = {
+    if (!isInitialize) {
+      initialize()
+    }
+    carbonAppendableStreamSink.addBatch(time.milliseconds, dataFrame)
+  }
+
+  private val extraOptions = new scala.collection.mutable.HashMap[String, String]
+  private var mode: SaveMode = SaveMode.ErrorIfExists
+
+  this.option("dbName", carbonTable.getDatabaseName)
+  this.option("tableName", carbonTable.getTableName)
+
+  /**
+   * Specifies the behavior when data or table already exists. Options include:
+   *   - `SaveMode.Overwrite`: overwrite the existing data.
+   *   - `SaveMode.Append`: append the data.
+   *   - `SaveMode.Ignore`: ignore the operation (i.e. no-op).
+   *   - `SaveMode.ErrorIfExists`: default option, throw an exception at runtime.
+   */
+  def mode(saveMode: SaveMode): CarbonStreamSparkStreamingWriter = {
+    if (mode == SaveMode.ErrorIfExists) {
+      mode = saveMode
+    }
+    this
+  }
+
+  /**
+   * Specifies the behavior when data or table already exists. Options include:
+   *   - `overwrite`: overwrite the existing data.
+   *   - `append`: append the data.
+   *   - `ignore`: ignore the operation (i.e. no-op).
+   *   - `error or default`: default option, throw an exception at runtime.
+   */
+  def mode(saveMode: String): CarbonStreamSparkStreamingWriter = {
+    if (mode == SaveMode.ErrorIfExists) {
+      mode = saveMode.toLowerCase(util.Locale.ROOT) match {
+        case "overwrite" => SaveMode.Overwrite
+        case "append" => SaveMode.Append
+        case "ignore" => SaveMode.Ignore
+        case "error" | "default" => SaveMode.ErrorIfExists
+        case _ => throw new IllegalArgumentException(s"Unknown save mode: $saveMode. " +
+          "Accepted save modes are 'overwrite', 'append', 'ignore', 'error'.")
+      }
+    }
+    this
+  }
+
+  /**
+   * Adds an output option
+   */
+  def option(key: String, value: String): CarbonStreamSparkStreamingWriter = {
+    if (!extraOptions.contains(key)) {
+      extraOptions += (key -> value)
+    }
+    this
+  }
+
+  /**
+   * Adds an output option
+   */
+  def option(key: String, value: Boolean): CarbonStreamSparkStreamingWriter =
+    option(key, value.toString)
+
+  /**
+   * Adds an output option
+   */
+  def option(key: String, value: Long): CarbonStreamSparkStreamingWriter =
+    option(key, value.toString)
+
+  /**
+   * Adds an output option
+   */
+  def option(key: String, value: Double): CarbonStreamSparkStreamingWriter =
+    option(key, value.toString)
+}
+
+object CarbonStreamSparkStreaming {
+
+  @transient private val tableMap =
+    new util.HashMap[String, CarbonStreamSparkStreamingWriter]()
+
+  def getTableMap: util.Map[String, CarbonStreamSparkStreamingWriter] = tableMap
+
+  /**
+   * Remove all stream table locks.
+   */
+  def cleanAllLockAfterStop(): Unit = {
+    tableMap.asScala.values.foreach { writer => writer.unLockStreamTable() }
+    tableMap.clear()
+  }
+}
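
A short sketch of how this lock bookkeeping is intended to be used (illustrative only; `ssc` is an assumed, already-created StreamingContext, following the example application earlier in this mail):

```
import org.apache.spark.streaming.StreamingContext

import org.apache.carbondata.streaming.{CarbonSparkStreamingListener, CarbonStreamSparkStreaming}

// Hedged sketch: `ssc` is assumed to exist and to drive Carbon stream writers.
def installLockCleanup(ssc: StreamingContext): Unit = {
  // release all stream table locks automatically when the application ends
  ssc.sparkContext.addSparkListener(new CarbonSparkStreamingListener())
}

def stopAndClean(ssc: StreamingContext): Unit = {
  // stop streaming gracefully without stopping the SparkContext ...
  ssc.stop(false, true)
  // ... and release any remaining stream table locks explicitly
  CarbonStreamSparkStreaming.cleanAllLockAfterStop()
}
```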


[04/25] carbondata git commit: [CARBONDATA-2138] Added documentation for HEADER option while loading data

Posted by ra...@apache.org.
[CARBONDATA-2138] Added documentation for HEADER option while loading data

Added documentation for HEADER option in load data

This closes #1938


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/4033f4ce
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/4033f4ce
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/4033f4ce

Branch: refs/heads/branch-1.3
Commit: 4033f4ce4f1ba280fca662de07c1253c7896b9e5
Parents: fd481f5
Author: sgururajshetty <sg...@gmail.com>
Authored: Tue Feb 6 20:58:59 2018 +0530
Committer: ravipesala <ra...@gmail.com>
Committed: Sat Mar 3 17:40:07 2018 +0530

----------------------------------------------------------------------
 docs/data-management-on-carbondata.md | 11 +++++++++++
 1 file changed, 11 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/4033f4ce/docs/data-management-on-carbondata.md
----------------------------------------------------------------------
diff --git a/docs/data-management-on-carbondata.md b/docs/data-management-on-carbondata.md
index fba2916..61bb356 100644
--- a/docs/data-management-on-carbondata.md
+++ b/docs/data-management-on-carbondata.md
@@ -330,6 +330,16 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
     OPTIONS('COMMENTCHAR'='#')
     ```
 
+  - **HEADER:** When the CSV file being loaded has no header row and its columns match the table schema, add 'HEADER'='false' to the LOAD DATA SQL so that the file header need not be provided. By default the value is 'true'.
+  false: the CSV file has no header row.
+  true: the CSV file has a header row.
+  
+    ```
+    OPTIONS('HEADER'='false') 
+    ```
+
+	NOTE: If the HEADER option exists and is set to 'true', then the FILEHEADER option is not required.
+	
   - **FILEHEADER:** Headers can be provided in the LOAD DATA command if headers are missing in the source files.
 
     ```
@@ -402,6 +412,7 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
    ```
    LOAD DATA local inpath '/opt/rawdata/data.csv' INTO table carbontable
    options('DELIMITER'=',', 'QUOTECHAR'='"','COMMENTCHAR'='#',
+   'HEADER'='false',
    'FILEHEADER'='empno,empname,designation,doj,workgroupcategory,
    workgroupcategoryname,deptno,deptname,projectcode,
    projectjoindate,projectenddate,attendance,utilization,salary',
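
A small usage sketch of the documented HEADER option (illustrative only; `spark` is an assumed SparkSession/CarbonSession and the table, path and column names are placeholders):

```
// Hedged sketch: 'carbontable' and the CSV paths are assumptions for illustration.
// CSV without a header row: set 'HEADER'='false'; FILEHEADER can supply the column names.
spark.sql(
  """
    | LOAD DATA LOCAL INPATH '/opt/rawdata/data.csv' INTO TABLE carbontable
    | OPTIONS('DELIMITER'=',', 'HEADER'='false',
    | 'FILEHEADER'='empno,empname,designation')
  """.stripMargin)

// CSV with a header row: HEADER defaults to 'true', so FILEHEADER is not required.
spark.sql(
  """
    | LOAD DATA LOCAL INPATH '/opt/rawdata/data_with_header.csv' INTO TABLE carbontable
  """.stripMargin)
```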


[12/25] carbondata git commit: [CARBONDATA-2161] update mergeTo column for compacted segment of streaming table

Posted by ra...@apache.org.
[CARBONDATA-2161] update mergeTo column for compacted segment of streaming table

This closes #1971


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/36e770ce
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/36e770ce
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/36e770ce

Branch: refs/heads/branch-1.3
Commit: 36e770ce631e2bdb741fbd0d0ac60064399946f9
Parents: 95ac5ef
Author: BJangir <ba...@gmail.com>
Authored: Mon Feb 12 01:02:30 2018 +0530
Committer: ravipesala <ra...@gmail.com>
Committed: Sat Mar 3 17:55:40 2018 +0530

----------------------------------------------------------------------
 .../org/apache/spark/carbondata/TestStreamingTableOperation.scala  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/36e770ce/integration/spark2/src/test/scala/org/apache/spark/carbondata/TestStreamingTableOperation.scala
----------------------------------------------------------------------
diff --git a/integration/spark2/src/test/scala/org/apache/spark/carbondata/TestStreamingTableOperation.scala b/integration/spark2/src/test/scala/org/apache/spark/carbondata/TestStreamingTableOperation.scala
index 5644302..881af3a 100644
--- a/integration/spark2/src/test/scala/org/apache/spark/carbondata/TestStreamingTableOperation.scala
+++ b/integration/spark2/src/test/scala/org/apache/spark/carbondata/TestStreamingTableOperation.scala
@@ -1074,7 +1074,7 @@ class TestStreamingTableOperation extends QueryTest with BeforeAndAfterAll {
     //Verify MergeTO column entry for compacted Segments
     newSegments.filter(_.getString(1).equals("Compacted")).foreach{ rw =>
       assertResult("Compacted")(rw.getString(1))
-      assert(Integer.parseInt(rw.getString(0)) < Integer.parseInt(rw.getString(4)))
+      assertResult((Integer.parseInt(rw.getString(0))+2).toString)(rw.getString(4))
     }
     checkAnswer(
       sql("select count(*) from streaming.stream_table_reopen"),


[10/25] carbondata git commit: [CARBONDATA-2144] Optimize preaggregate table documentation, include timeseries

Posted by ra...@apache.org.
[CARBONDATA-2144] Optimize preaggregate table documentation, include timeseries

Optimize preaggregate table documentation, include timeseries

This closes #1949


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/877172c7
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/877172c7
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/877172c7

Branch: refs/heads/branch-1.3
Commit: 877172c769af44b3166243b42841d464b8e0c261
Parents: 87361a8
Author: xubo245 <60...@qq.com>
Authored: Fri Mar 2 17:42:40 2018 +0800
Committer: ravipesala <ra...@gmail.com>
Committed: Sat Mar 3 17:48:13 2018 +0530

----------------------------------------------------------------------
 docs/data-management-on-carbondata.md | 139 +++++++++++++++--------------
 1 file changed, 71 insertions(+), 68 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/877172c7/docs/data-management-on-carbondata.md
----------------------------------------------------------------------
diff --git a/docs/data-management-on-carbondata.md b/docs/data-management-on-carbondata.md
index 9678a32..ea80d41 100644
--- a/docs/data-management-on-carbondata.md
+++ b/docs/data-management-on-carbondata.md
@@ -127,14 +127,14 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
 
    ```
     CREATE TABLE IF NOT EXISTS productSchema.productSalesTable (
-                                   productNumber Int,
-                                   productName String,
-                                   storeCity String,
-                                   storeProvince String,
-                                   productCategory String,
-                                   productBatch String,
-                                   saleQuantity Int,
-                                   revenue Int)
+                                   productNumber INT,
+                                   productName STRING,
+                                   storeCity STRING,
+                                   storeProvince STRING,
+                                   productCategory STRING,
+                                   productBatch STRING,
+                                   saleQuantity INT,
+                                   revenue INT)
     STORED BY 'carbondata'
     TBLPROPERTIES ('SORT_COLUMNS'='productName,storeCity',
                    'SORT_SCOPE'='NO_SORT')
@@ -647,13 +647,13 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
   Example:
   ```
    CREATE TABLE IF NOT EXISTS productSchema.productSalesTable (
-                                productNumber Int,
-                                productName String,
-                                storeCity String,
-                                storeProvince String,
-                                saleQuantity Int,
-                                revenue Int)
-  PARTITIONED BY (productCategory String, productBatch String)
+                                productNumber INT,
+                                productName STRING,
+                                storeCity STRING,
+                                storeProvince STRING,
+                                saleQuantity INT,
+                                revenue INT)
+  PARTITIONED BY (productCategory STRING, productBatch STRING)
   STORED BY 'carbondata'
   ```
 		
@@ -745,12 +745,12 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
   Example:
   ```
   CREATE TABLE IF NOT EXISTS hash_partition_table(
-      col_A String,
-      col_B Int,
-      col_C Long,
-      col_D Decimal(10,2),
-      col_F Timestamp
-  ) PARTITIONED BY (col_E Long)
+      col_A STRING,
+      col_B INT,
+      col_C LONG,
+      col_D DECIMAL(10,2),
+      col_F TIMESTAMP
+  ) PARTITIONED BY (col_E LONG)
   STORED BY 'carbondata' TBLPROPERTIES('PARTITION_TYPE'='HASH','NUM_PARTITIONS'='9')
   ```
 
@@ -773,11 +773,11 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
   Example:
   ```
   CREATE TABLE IF NOT EXISTS range_partition_table(
-      col_A String,
-      col_B Int,
-      col_C Long,
-      col_D Decimal(10,2),
-      col_E Long
+      col_A STRING,
+      col_B INT,
+      col_C LONG,
+      col_D DECIMAL(10,2),
+      col_E LONG
    ) partitioned by (col_F Timestamp)
    PARTITIONED BY 'carbondata'
    TBLPROPERTIES('PARTITION_TYPE'='RANGE',
@@ -800,12 +800,12 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
   Example:
   ```
   CREATE TABLE IF NOT EXISTS list_partition_table(
-      col_B Int,
-      col_C Long,
-      col_D Decimal(10,2),
-      col_E Long,
-      col_F Timestamp
-   ) PARTITIONED BY (col_A String)
+      col_B INT,
+      col_C LONG,
+      col_D DECIMAL(10,2),
+      col_E LONG,
+      col_F TIMESTAMP
+   ) PARTITIONED BY (col_A STRING)
    STORED BY 'carbondata'
    TBLPROPERTIES('PARTITION_TYPE'='LIST',
    'LIST_INFO'='aaaa, bbbb, (cccc, dddd), eeee')
@@ -861,22 +861,22 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
 
 
 ## PRE-AGGREGATE TABLES
-  Carbondata supports pre aggregating of data so that OLAP kind of queries can fetch data 
-  much faster.Aggregate tables are created as datamaps so that the handling is as efficient as 
-  other indexing support.Users can create as many aggregate tables they require as datamaps to 
-  improve their query performance,provided the storage requirements and loading speeds are 
+  CarbonData supports pre-aggregation of data so that OLAP kind of queries can fetch data 
+  much faster. Aggregate tables are created as datamaps so that the handling is as efficient as 
+  other indexing support. Users can create as many aggregate tables as they require as datamaps to 
+  improve their query performance, provided the storage requirements and loading speeds are 
   acceptable.
   
   For main table called **sales** which is defined as 
   
   ```
   CREATE TABLE sales (
-  order_time timestamp,
-  user_id string,
-  sex string,
-  country string,
-  quantity int,
-  price bigint)
+            order_time timestamp,
+            user_id STRING,
+            sex STRING,
+            country STRING,
+            quantity INT,
+            price BIGINT)
   STORED BY 'carbondata'
   ```
   
@@ -944,7 +944,7 @@ pre-aggregate table to fetch the data.
 ##### Compacting pre-aggregate tables
 Compaction command (ALTER TABLE COMPACT) need to be run separately on each pre-aggregate table.
 Running Compaction command on main table will **not automatically** compact the pre-aggregate 
-tables.Compaction is an optional operation for pre-aggregate table. If compaction is performed on
+tables. Compaction is an optional operation for pre-aggregate table. If compaction is performed on
 main table but not performed on pre-aggregate table, all queries still can benefit from 
 pre-aggregate tables. To further improve performance on pre-aggregate tables, compaction can be 
 triggered on pre-aggregate tables directly, it will merge the segments inside pre-aggregate table. 
@@ -963,7 +963,7 @@ This functionality is not supported.
 
   NOTE (<b>RESTRICTION</b>):
   Delete Segment operations are <b>not supported</b> on main table which has pre-aggregate tables 
-  created on it. All the pre-aggregate tables <b>will have to be dropped</b> before update/delete 
+  created on it. All the pre-aggregate tables <b>will have to be dropped</b> before delete segment 
   operations can be performed on the main table. Pre-aggregate tables can be rebuilt manually 
   after delete segment operations are completed
   
@@ -974,12 +974,12 @@ This functionality is not supported.
   Adding new column in new table does not have any affect on pre-aggregate tables. However if 
   dropping or renaming a column has impact in pre-aggregate table, such operations will be 
   rejected and error will be thrown. All the pre-aggregate tables <b>will have to be dropped</b> 
-  before Alter Operations can be performed on the main table. Pre-aggregate tables can be rebuilt 
-  manually after Alter Table operations are completed
+  before alter operations can be performed on the main table. Pre-aggregate tables can be rebuilt 
+  manually after alter table operations are completed
   
 ### Supporting timeseries data (Alpha feature in 1.3.0)
-Carbondata has built-in understanding of time hierarchy and levels: year, month, day, hour, minute.
-Multiple pre-aggregate tables can be created for the hierarchy and Carbondata can do automatic 
+CarbonData has built-in understanding of time hierarchy and levels: year, month, day, hour, minute, second.
+Timeseries pre-aggregate tables can be created for the hierarchy and CarbonData can do automatic 
 roll-up for the queries on these hierarchies.
 
   ```
@@ -1043,21 +1043,24 @@ roll-up for the queries on these hierarchies.
   SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
    avg(price) FROM sales GROUP BY order_time, country, sex
   ```
-  
-  For Querying data and automatically roll-up to the desired aggregation level,Carbondata supports 
+  For querying data and automatically rolling up to the desired aggregation level, CarbonData supports 
   UDF as
   ```
   timeseries(timeseries column name, 'aggregation level')
   ```
+  Examples
   ```
-  Select timeseries(order_time, 'hour'), sum(quantity) from sales group by timeseries(order_time,
-  'hour')
+  SELECT 
+        timeseries(order_time, 'hour'), 
+        sum(quantity) 
+  FROM sales 
+  GROUP BY timeseries(order_time, 'hour')
   ```
   
   It is **not necessary** to create pre-aggregate tables for each granularity unless required for 
-  query. Carbondata can roll-up the data and fetch it.
+  query. CarbonData can roll-up the data and fetch it.
    
-  For Example: For main table **sales** , If pre-aggregate tables were created as  
+  For Example: For main table **sales**, If timeseries pre-aggregate tables were created as  
   
   ```
   CREATE DATAMAP agg_day
@@ -1091,10 +1094,10 @@ roll-up for the queries on these hierarchies.
   ```
   
   NOTE (<b>RESTRICTION</b>):
-  * Only value of 1 is supported for hierarchy levels. Other hierarchy levels are not supported. 
-  Other hierarchy levels are not supported
-  * pre-aggregate tables for the desired levels needs to be created one after the other
-  * pre-aggregate tables created for each level needs to be dropped separately 
+  * Only a granularity value of 1 is supported for timeseries pre-aggregate tables. Other granularity values are not supported.
+  * Only one granularity can be defined when creating a timeseries pre-aggregate table. Other granularities must be created as separate datamaps.
+  * Pre-aggregate tables for the desired levels need to be created one after the other
+  * Pre-aggregate tables created for each level need to be dropped separately
     
 
 ## BUCKETING
@@ -1119,14 +1122,14 @@ roll-up for the queries on these hierarchies.
   Example:
   ```
   CREATE TABLE IF NOT EXISTS productSchema.productSalesTable (
-                                productNumber Int,
-                                saleQuantity Int,
-                                productName String,
-                                storeCity String,
-                                storeProvince String,
-                                productCategory String,
-                                productBatch String,
-                                revenue Int)
+                                productNumber INT,
+                                saleQuantity INT,
+                                productName STRING,
+                                storeCity STRING,
+                                storeProvince STRING,
+                                productCategory STRING,
+                                productBatch STRING,
+                                revenue INT)
   STORED BY 'carbondata'
   TBLPROPERTIES ('BUCKETNUMBER'='4', 'BUCKETCOLUMNS'='productName')
   ```
@@ -1201,7 +1204,7 @@ roll-up for the queries on these hierarchies.
   NOTE:
   carbon.input.segments: Specifies the segment IDs to be queried. This property allows you to query specified segments of the specified table. The CarbonScan will read data from specified segments only.
   
-  If user wants to query with segments reading in multi threading mode, then CarbonSession.threadSet can be used instead of SET query.
+  If the user wants to query with segments reading in multi-threaded mode, then CarbonSession.threadSet can be used instead of the SET query.
   ```
   CarbonSession.threadSet ("carbon.input.segments.<database_name>.<table_name>","<list of segment IDs>");
   ```
@@ -1211,7 +1214,7 @@ roll-up for the queries on these hierarchies.
   SET carbon.input.segments.<database_name>.<table_name> = *;
   ```
   
-  If user wants to query with segments reading in multi threading mode, then CarbonSession.threadSet can be used instead of SET query. 
+  If the user wants to query with segments reading in multi-threaded mode, then CarbonSession.threadSet can be used instead of the SET query. 
   ```
   CarbonSession.threadSet ("carbon.input.segments.<database_name>.<table_name>","*");
   ```
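
A compact sketch combining the two features touched by this documentation change, namely per-thread segment selection and timeseries roll-up (illustrative only; `spark`, the `default.sales` table and the segment IDs are assumptions):

```
import org.apache.spark.sql.CarbonSession

// per-thread alternative to "SET carbon.input.segments.default.sales = 1,3"
CarbonSession.threadSet("carbon.input.segments.default.sales", "1,3")

// roll-up query; CarbonData can serve it from a timeseries pre-aggregate table if one exists
spark.sql(
  """
    | SELECT timeseries(order_time, 'hour'), sum(quantity)
    | FROM sales
    | GROUP BY timeseries(order_time, 'hour')
  """.stripMargin).show()

// read all segments again for this thread
CarbonSession.threadSet("carbon.input.segments.default.sales", "*")
```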


[16/25] carbondata git commit: [CARBONDATA-2208]Pre aggregate datamap creation is failing when count(*) present in query

Posted by ra...@apache.org.
[CARBONDATA-2208]Pre aggregate datamap creation is failing when count(*) present in query

create datamap agg on table maintable using 'preaggregate' as select name, count(*) from maintable group by name

This closes #2004


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/bbe73767
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/bbe73767
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/bbe73767

Branch: refs/heads/branch-1.3
Commit: bbe73767887dabf9886dcaa8849ebac9256da6c5
Parents: 566217c
Author: kumarvishal <ku...@gmail.com>
Authored: Tue Feb 27 16:01:06 2018 +0530
Committer: ravipesala <ra...@gmail.com>
Committed: Sat Mar 3 18:03:27 2018 +0530

----------------------------------------------------------------------
 .../apache/carbondata/core/util/path/CarbonTablePath.java    | 7 ++++++-
 .../preaggregate/TestPreAggregateTableSelection.scala        | 7 +++++++
 .../execution/command/preaaggregate/PreAggregateUtil.scala   | 8 +++++---
 3 files changed, 18 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/bbe73767/core/src/main/java/org/apache/carbondata/core/util/path/CarbonTablePath.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/carbondata/core/util/path/CarbonTablePath.java b/core/src/main/java/org/apache/carbondata/core/util/path/CarbonTablePath.java
index f232c23..b5fe5ea 100644
--- a/core/src/main/java/org/apache/carbondata/core/util/path/CarbonTablePath.java
+++ b/core/src/main/java/org/apache/carbondata/core/util/path/CarbonTablePath.java
@@ -75,7 +75,12 @@ public class CarbonTablePath extends Path {
    * @param carbonFilePath
    */
   public static String getFolderContainingFile(String carbonFilePath) {
-    return carbonFilePath.substring(0, carbonFilePath.lastIndexOf('/'));
+    int lastIndex = carbonFilePath.lastIndexOf('/');
+    // the code below handles the Windows environment (File.separator)
+    if (-1 == lastIndex) {
+      lastIndex = carbonFilePath.lastIndexOf(File.separator);
+    }
+    return carbonFilePath.substring(0, lastIndex);
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/carbondata/blob/bbe73767/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/preaggregate/TestPreAggregateTableSelection.scala
----------------------------------------------------------------------
diff --git a/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/preaggregate/TestPreAggregateTableSelection.scala b/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/preaggregate/TestPreAggregateTableSelection.scala
index 19d4abe..0f59949 100644
--- a/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/preaggregate/TestPreAggregateTableSelection.scala
+++ b/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/preaggregate/TestPreAggregateTableSelection.scala
@@ -49,6 +49,7 @@ class TestPreAggregateTableSelection extends QueryTest with BeforeAndAfterAll {
     sql("create datamap agg6 on table mainTable using 'preaggregate' as select name,min(age) from mainTable group by name")
     sql("create datamap agg7 on table mainTable using 'preaggregate' as select name,max(age) from mainTable group by name")
     sql("create datamap agg8 on table maintable using 'preaggregate' as select name, sum(id), avg(id) from maintable group by name")
+    sql("create datamap agg9 on table maintable using 'preaggregate' as select name, count(*) from maintable group by name")
     sql("CREATE TABLE mainTableavg(id int, name string, city string, age bigint) STORED BY 'org.apache.carbondata.format'")
     sql("create datamap agg0 on table mainTableavg using 'preaggregate' as select name,sum(age), avg(age) from mainTableavg group by name")
     sql(s"LOAD DATA LOCAL INPATH '$resourcesPath/measureinsertintotest.csv' into table mainTable")
@@ -66,6 +67,11 @@ class TestPreAggregateTableSelection extends QueryTest with BeforeAndAfterAll {
     preAggTableValidator(df.queryExecution.analyzed, "maintable_agg0")
   }
 
+  test("test PreAggregate table selection with count(*)") {
+    val df = sql("select name, count(*) from mainTable group by name")
+    preAggTableValidator(df.queryExecution.analyzed, "maintable_agg9")
+  }
+
   test("test PreAggregate table selection 2") {
     val df = sql("select name from mainTable where name in (select name from mainTable) group by name")
     preAggTableValidator(df.queryExecution.analyzed, "mainTable")
@@ -342,6 +348,7 @@ test("test PreAggregate table selection with timeseries and normal together") {
     sql("drop table if exists lineitem")
     sql("DROP TABLE IF EXISTS maintabletime")
     sql("DROP TABLE IF EXISTS maintabledict")
+    sql("DROP TABLE IF EXISTS mainTableavg")
   }
 
 }

http://git-wip-us.apache.org/repos/asf/carbondata/blob/bbe73767/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/preaaggregate/PreAggregateUtil.scala
----------------------------------------------------------------------
diff --git a/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/preaaggregate/PreAggregateUtil.scala b/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/preaaggregate/PreAggregateUtil.scala
index 0bee383..1bd12cd 100644
--- a/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/preaaggregate/PreAggregateUtil.scala
+++ b/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/preaaggregate/PreAggregateUtil.scala
@@ -346,11 +346,13 @@ object PreAggregateUtil {
         carbonTable)
     }
     // if parent column relation is of size more than one that means aggregate table
-    // column is derived from multiple column of main table
-    // or if expression is not a instance of attribute reference
+    // column is derived from multiple columns of the main table, or if the size is zero then the
+    // column present in the select statement is a constant, for example count(*),
+    // and if the expression is not an instance of AttributeReference
     // then use column name which is passed
     val columnName =
-    if (parentColumnsName.size > 1 && !expression.isInstanceOf[AttributeReference]) {
+    if ((parentColumnsName.size > 1 || parentColumnsName.isEmpty) &&
+        !expression.isInstanceOf[AttributeReference]) {
       newColumnName
     } else {
       expression.asInstanceOf[AttributeReference].name


[07/25] carbondata git commit: [CARBONDATA-2135] Documentation for Table comment and Column Comment

Posted by ra...@apache.org.
[CARBONDATA-2135] Documentation for Table comment and Column Comment

Documentation for table comment and column comment

This closes #1936


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/433bdf3b
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/433bdf3b
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/433bdf3b

Branch: refs/heads/branch-1.3
Commit: 433bdf3bf6c93bf1890eb71cd3bd5813628575a0
Parents: e5d9802
Author: sgururajshetty <sg...@gmail.com>
Authored: Tue Feb 6 16:06:42 2018 +0530
Committer: ravipesala <ra...@gmail.com>
Committed: Sat Mar 3 17:46:35 2018 +0530

----------------------------------------------------------------------
 docs/data-management-on-carbondata.md | 99 +++++++++++++++++++++---------
 1 file changed, 71 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/433bdf3b/docs/data-management-on-carbondata.md
----------------------------------------------------------------------
diff --git a/docs/data-management-on-carbondata.md b/docs/data-management-on-carbondata.md
index 78ab010..9678a32 100644
--- a/docs/data-management-on-carbondata.md
+++ b/docs/data-management-on-carbondata.md
@@ -286,7 +286,40 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
   * If the table is aggregate table, then all the aggregate tables should be copied to the new database location.
   * For old store, the time zone of the source and destination cluster should be same.
   * If old cluster used HIVE meta store to store schema, refresh will not work as schema file does not exist in file system.
+
+### Table and Column Comment
+
+  You can provide more information on a table by using a table comment. Similarly, you can provide more information about a particular column using a column comment. 
+  You can see the column comment of an existing table using the DESCRIBE FORMATTED command.
   
+  ```
+  CREATE TABLE [IF NOT EXISTS] [db_name.]table_name[(col_name data_type [COMMENT col_comment], ...)]
+    [COMMENT table_comment]
+  STORED BY 'carbondata'
+  [TBLPROPERTIES (property_name=property_value, ...)]
+  ```
+  
+  Example:
+  ```
+  CREATE TABLE IF NOT EXISTS productSchema.productSalesTable (
+                                productNumber Int COMMENT 'unique serial number for product')
+  COMMENT 'This is table comment'
+  STORED BY 'carbondata'
+   TBLPROPERTIES ('DICTIONARY_INCLUDE'='productNumber')
+  ```
+  You can also SET and UNSET table comment using ALTER command.
+
+  Example to SET table comment:
+  
+  ```
+  ALTER TABLE carbon SET TBLPROPERTIES ('comment'='this table comment is modified');
+  ```
+  
+  Example to UNSET table comment:
+  
+  ```
+  ALTER TABLE carbon UNSET TBLPROPERTIES ('comment');
+  ```
 
 ## LOAD DATA
 
@@ -954,9 +987,9 @@ roll-up for the queries on these hierarchies.
   ON TABLE sales
   USING "timeseries"
   DMPROPERTIES (
-  'event_time’=’order_time’,
-  'year_granularity’=’1’)
-  AS
+  'event_time'='order_time',
+  'year_granularity'='1'
+  ) AS
   SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
    avg(price) FROM sales GROUP BY order_time, country, sex
     
@@ -964,9 +997,9 @@ roll-up for the queries on these hierarchies.
   ON TABLE sales
   USING "timeseries"
   DMPROPERTIES (
-  'event_time’=’order_time’,
-  'month_granularity’=’1’)
-  AS
+  'event_time'='order_time',
+  'month_granularity'='1'
+  ) AS
   SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
    avg(price) FROM sales GROUP BY order_time, country, sex
     
@@ -974,9 +1007,9 @@ roll-up for the queries on these hierarchies.
   ON TABLE sales
   USING "timeseries"
   DMPROPERTIES (
-  'event_time’=’order_time’,
-  'day_granularity’=’1’)
-  AS
+  'event_time'='order_time',
+  'day_granularity'='1'
+  ) AS
   SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
    avg(price) FROM sales GROUP BY order_time, country, sex
         
@@ -984,9 +1017,9 @@ roll-up for the queries on these hierarchies.
   ON TABLE sales
   USING "timeseries"
   DMPROPERTIES (
-  'event_time’=’order_time’,
-  'hour_granularity’=’1’)
-  AS
+  'event_time'='order_time',
+  'hour_granularity'='1'
+  ) AS
   SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
    avg(price) FROM sales GROUP BY order_time, country, sex
   
@@ -994,9 +1027,19 @@ roll-up for the queries on these hierarchies.
   ON TABLE sales
   USING "timeseries"
   DMPROPERTIES (
-  'event_time’=’order_time’,
-  'minute_granularity’=’1’)
-  AS
+  'event_time'='order_time',
+  'minute_granularity'='1'
+  ) AS
+  SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
+   avg(price) FROM sales GROUP BY order_time, country, sex
+    
+  CREATE DATAMAP agg_minute
+  ON TABLE sales
+  USING "timeseries"
+  DMPROPERTIES (
+  'event_time'='order_time',
+  'minute_granularity'='1'
+  ) AS
   SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
    avg(price) FROM sales GROUP BY order_time, country, sex
   ```
@@ -1004,11 +1047,11 @@ roll-up for the queries on these hierarchies.
   For Querying data and automatically roll-up to the desired aggregation level,Carbondata supports 
   UDF as
   ```
-  timeseries(timeseries column name, ‘aggregation level’)
+  timeseries(timeseries column name, 'aggregation level')
   ```
   ```
-  Select timeseries(order_time, ‘hour’), sum(quantity) from sales group by timeseries(order_time,
-  ’hour’)
+  Select timeseries(order_time, 'hour'), sum(quantity) from sales group by timeseries(order_time,
+  'hour')
   ```
   
   It is **not necessary** to create pre-aggregate tables for each granularity unless required for 
@@ -1021,9 +1064,9 @@ roll-up for the queries on these hierarchies.
     ON TABLE sales
     USING "timeseries"
     DMPROPERTIES (
-    'event_time’=’order_time’,
-    'day_granularity’=’1’)
-    AS
+    'event_time'='order_time',
+    'day_granularity'='1'
+    ) AS
     SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
      avg(price) FROM sales GROUP BY order_time, country, sex
           
@@ -1031,20 +1074,20 @@ roll-up for the queries on these hierarchies.
     ON TABLE sales
     USING "timeseries"
     DMPROPERTIES (
-    'event_time’=’order_time’,
-    'hour_granularity’=’1’)
-    AS
+    'event_time'='order_time',
+    'hour_granularity'='1'
+    ) AS
     SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
      avg(price) FROM sales GROUP BY order_time, country, sex
   ```
   
   Queries like below will be rolled-up and fetched from pre-aggregate tables
   ```
-  Select timeseries(order_time, ‘month’), sum(quantity) from sales group by timeseries(order_time,
-    ’month’)
+  Select timeseries(order_time, 'month'), sum(quantity) from sales group by timeseries(order_time,
+    'month')
     
-  Select timeseries(order_time, ‘year’), sum(quantity) from sales group by timeseries(order_time,
-    ’year’)
+  Select timeseries(order_time, 'year'), sum(quantity) from sales group by timeseries(order_time,
+    'year')
   ```
   
   NOTE (<b>RESTRICTION</b>):


[20/25] carbondata git commit: [CARBONDATA-2196] Take CarbonTable from loadmodel during streaming ingestion

Posted by ra...@apache.org.
[CARBONDATA-2196] Take CarbonTable from loadmodel during streaming ingestion

This closes #1991


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/65234b27
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/65234b27
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/65234b27

Branch: refs/heads/branch-1.3
Commit: 65234b27df8abd16fa3e7e21c0bf72fa5440aa4e
Parents: 6d3105b
Author: rahulforallp <ra...@knoldus.in>
Authored: Thu Feb 22 18:29:57 2018 +0530
Committer: ravipesala <ra...@gmail.com>
Committed: Sat Mar 3 18:05:04 2018 +0530

----------------------------------------------------------------------
 .../carbondata/core/datamap/TableDataMap.java   |  6 ++--
 .../hadoop/api/CarbonTableInputFormat.java      |  2 +-
 .../merger/CompactionResultSortProcessor.java   |  6 ++--
 .../util/CarbonDataProcessorUtil.java           | 36 ++++++++++++++++----
 4 files changed, 35 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/65234b27/core/src/main/java/org/apache/carbondata/core/datamap/TableDataMap.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/carbondata/core/datamap/TableDataMap.java b/core/src/main/java/org/apache/carbondata/core/datamap/TableDataMap.java
index 6555d6c..020d6c9 100644
--- a/core/src/main/java/org/apache/carbondata/core/datamap/TableDataMap.java
+++ b/core/src/main/java/org/apache/carbondata/core/datamap/TableDataMap.java
@@ -176,16 +176,16 @@ public final class TableDataMap extends OperationEventListener {
    * @return
    * @throws IOException
    */
-  public List<String> pruneSegments(List<Segment> segments, FilterResolverIntf filterExp)
+  public List<Segment> pruneSegments(List<Segment> segments, FilterResolverIntf filterExp)
       throws IOException {
-    List<String> prunedSegments = new ArrayList<>(CarbonCommonConstants.DEFAULT_COLLECTION_SIZE);
+    List<Segment> prunedSegments = new ArrayList<>(CarbonCommonConstants.DEFAULT_COLLECTION_SIZE);
     for (Segment segment : segments) {
       List<DataMap> dataMaps = dataMapFactory.getDataMaps(segment);
       for (DataMap dataMap : dataMaps) {
         if (dataMap.isScanRequired(filterExp)) {
           // If any one task in a given segment contains the data that means the segment need to
           // be scanned and we need to validate further data maps in the same segment
-          prunedSegments.add(segment.getSegmentNo());
+          prunedSegments.add(segment);
           break;
         }
       }

http://git-wip-us.apache.org/repos/asf/carbondata/blob/65234b27/hadoop/src/main/java/org/apache/carbondata/hadoop/api/CarbonTableInputFormat.java
----------------------------------------------------------------------
diff --git a/hadoop/src/main/java/org/apache/carbondata/hadoop/api/CarbonTableInputFormat.java b/hadoop/src/main/java/org/apache/carbondata/hadoop/api/CarbonTableInputFormat.java
index 96b0b21..3dbf04f 100644
--- a/hadoop/src/main/java/org/apache/carbondata/hadoop/api/CarbonTableInputFormat.java
+++ b/hadoop/src/main/java/org/apache/carbondata/hadoop/api/CarbonTableInputFormat.java
@@ -910,7 +910,7 @@ public class CarbonTableInputFormat<T> extends FileInputFormat<Void, T> {
   /**
    * return valid segment to access
    */
-  private Segment[] getSegmentsToAccess(JobContext job) {
+  public Segment[] getSegmentsToAccess(JobContext job) {
     String segmentString = job.getConfiguration().get(INPUT_SEGMENT_NUMBERS, "");
     if (segmentString.trim().isEmpty()) {
       return new Segment[0];

http://git-wip-us.apache.org/repos/asf/carbondata/blob/65234b27/processing/src/main/java/org/apache/carbondata/processing/merger/CompactionResultSortProcessor.java
----------------------------------------------------------------------
diff --git a/processing/src/main/java/org/apache/carbondata/processing/merger/CompactionResultSortProcessor.java b/processing/src/main/java/org/apache/carbondata/processing/merger/CompactionResultSortProcessor.java
index 2fbdf4f..e7c4502 100644
--- a/processing/src/main/java/org/apache/carbondata/processing/merger/CompactionResultSortProcessor.java
+++ b/processing/src/main/java/org/apache/carbondata/processing/merger/CompactionResultSortProcessor.java
@@ -430,9 +430,7 @@ public class CompactionResultSortProcessor extends AbstractResultProcessor {
    */
   private void initTempStoreLocation() {
     tempStoreLocation = CarbonDataProcessorUtil
-        .getLocalDataFolderLocation(carbonLoadModel.getDatabaseName(), tableName,
-            carbonLoadModel.getTaskNo(), carbonLoadModel.getPartitionId(), segmentId,
-            true, false);
+        .getLocalDataFolderLocation(carbonTable, tableName, carbonLoadModel.getTaskNo(),
+            carbonLoadModel.getPartitionId(), segmentId, true, false);
   }
-
 }

http://git-wip-us.apache.org/repos/asf/carbondata/blob/65234b27/processing/src/main/java/org/apache/carbondata/processing/util/CarbonDataProcessorUtil.java
----------------------------------------------------------------------
diff --git a/processing/src/main/java/org/apache/carbondata/processing/util/CarbonDataProcessorUtil.java b/processing/src/main/java/org/apache/carbondata/processing/util/CarbonDataProcessorUtil.java
index 1e648e1..cfc9fa3 100644
--- a/processing/src/main/java/org/apache/carbondata/processing/util/CarbonDataProcessorUtil.java
+++ b/processing/src/main/java/org/apache/carbondata/processing/util/CarbonDataProcessorUtil.java
@@ -117,22 +117,25 @@ public final class CarbonDataProcessorUtil {
       }
     }
   }
+
   /**
+   *
    * This method will form the local data folder store location
    *
-   * @param databaseName
-   * @param tableName
+   * @param carbonTable
    * @param taskId
    * @param partitionId
    * @param segmentId
+   * @param isCompactionFlow
+   * @param isAltPartitionFlow
    * @return
    */
-  public static String[] getLocalDataFolderLocation(String databaseName, String tableName,
+  public static String[] getLocalDataFolderLocation(CarbonTable carbonTable, String tableName,
       String taskId, String partitionId, String segmentId, boolean isCompactionFlow,
       boolean isAltPartitionFlow) {
     String tempLocationKey =
-        getTempStoreLocationKey(databaseName, tableName, segmentId, taskId, isCompactionFlow,
-            isAltPartitionFlow);
+        getTempStoreLocationKey(carbonTable.getDatabaseName(), tableName,
+            segmentId, taskId, isCompactionFlow, isAltPartitionFlow);
     String baseTempStorePath = CarbonProperties.getInstance()
         .getProperty(tempLocationKey);
     if (baseTempStorePath == null) {
@@ -145,7 +148,6 @@ public final class CarbonDataProcessorUtil {
     String[] baseTmpStorePathArray = StringUtils.split(baseTempStorePath, File.pathSeparator);
     String[] localDataFolderLocArray = new String[baseTmpStorePathArray.length];
 
-    CarbonTable carbonTable = CarbonMetadata.getInstance().getCarbonTable(databaseName, tableName);
     for (int i = 0 ; i < baseTmpStorePathArray.length; i++) {
       String tmpStore = baseTmpStorePathArray[i];
       CarbonTablePath carbonTablePath =
@@ -159,6 +161,26 @@ public final class CarbonDataProcessorUtil {
   }
 
   /**
+   * This method will form the local data folder store location
+   *
+   * @param databaseName
+   * @param tableName
+   * @param taskId
+   * @param partitionId
+   * @param segmentId
+   * @param isCompactionFlow
+   * @param isAltPartitionFlow
+   * @return
+   */
+  public static String[] getLocalDataFolderLocation(String databaseName, String tableName,
+      String taskId, String partitionId, String segmentId, boolean isCompactionFlow,
+      boolean isAltPartitionFlow) {
+    CarbonTable carbonTable = CarbonMetadata.getInstance().getCarbonTable(databaseName, tableName);
+    return getLocalDataFolderLocation(carbonTable, tableName, taskId, partitionId,
+        segmentId, isCompactionFlow, isAltPartitionFlow);
+  }
+
+  /**
    * This method will form the key for getting the temporary location set in carbon properties
    *
    * @param databaseName
@@ -587,4 +609,4 @@ public final class CarbonDataProcessorUtil {
     return isRawDataRequired;
   }
 
-}
\ No newline at end of file
+}


[13/25] carbondata git commit: [HOTFIX] Check concurrent loading in compaction command

Posted by ra...@apache.org.
[HOTFIX] Check concurrent loading in compaction command

In the compaction command, concurrent loading should be checked in the processData method

This closes #2002
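
For illustration only, a minimal self-contained sketch of the pattern this change applies is shown below; the trait, exception, and flag are simplified stand-ins, not the actual CarbonData classes (the real change is in the diff further down):

```scala
// Simplified stand-ins for the real Spark/CarbonData types, for illustration only.
trait AtomicRunnableCommandSketch {
  def processMetadata(): Seq[String]
  def processData(): Seq[String]
}

class CompactionCommandSketch(isLoadInProgress: () => Boolean)
  extends AtomicRunnableCommandSketch {

  // No concurrency check in the metadata phase any more.
  override def processMetadata(): Seq[String] = Seq.empty

  override def processData(): Seq[String] = {
    // The guard runs here, immediately before the compaction work would start.
    if (isLoadInProgress()) {
      throw new IllegalStateException("loading is in progress; compaction is rejected")
    }
    Seq("compaction triggered")
  }
}

object CompactionCommandSketchDemo {
  def main(args: Array[String]): Unit = {
    val cmd = new CompactionCommandSketch(isLoadInProgress = () => false)
    println(cmd.processData())   // prints: List(compaction triggered)
  }
}
```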


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/abb0a0b2
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/abb0a0b2
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/abb0a0b2

Branch: refs/heads/branch-1.3
Commit: abb0a0b267430b94e988e26464108fce8c56dbf4
Parents: 36e770c
Author: Jacky Li <ja...@qq.com>
Authored: Tue Feb 27 14:24:41 2018 +0800
Committer: ravipesala <ra...@gmail.com>
Committed: Sat Mar 3 18:00:46 2018 +0530

----------------------------------------------------------------------
 .../command/management/CarbonAlterTableCompactionCommand.scala | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/abb0a0b2/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonAlterTableCompactionCommand.scala
----------------------------------------------------------------------
diff --git a/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonAlterTableCompactionCommand.scala b/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonAlterTableCompactionCommand.scala
index 7e3b699..f6019e4 100644
--- a/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonAlterTableCompactionCommand.scala
+++ b/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonAlterTableCompactionCommand.scala
@@ -83,13 +83,13 @@ case class CarbonAlterTableCompactionCommand(
       val loadMetadataEvent = new LoadMetadataEvent(table, true)
       OperationListenerBus.getInstance().fireEvent(loadMetadataEvent, operationContext)
     }
-    if (SegmentStatusManager.isLoadInProgressInTable(table)) {
-      throw new ConcurrentOperationException(table, "loading", "compaction")
-    }
     Seq.empty
   }
 
   override def processData(sparkSession: SparkSession): Seq[Row] = {
+    if (SegmentStatusManager.isLoadInProgressInTable(table)) {
+      throw new ConcurrentOperationException(table, "loading", "compaction")
+    }
     operationContext.setProperty("compactionException", "true")
     var compactionType: CompactionType = null
     var compactionException = "true"


[02/25] carbondata git commit: [HOTFIX] Fix documentation errors. Add examples for pre-aggregate usage

Posted by ra...@apache.org.
[HOTFIX] Fix documentation errors. Add examples for pre-aggregate usage

Fix documentation errors. Add examples for pre-aggregate usage

This closes #1945


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/ff2a2134
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/ff2a2134
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/ff2a2134

Branch: refs/heads/branch-1.3
Commit: ff2a2134401f54eb6fd57e818500b2401d486c50
Parents: b58de09
Author: Raghunandan S <ca...@gmail.com>
Authored: Wed Feb 7 17:27:51 2018 +0530
Committer: ravipesala <ra...@gmail.com>
Committed: Sat Mar 3 17:39:33 2018 +0530

----------------------------------------------------------------------
 docs/data-management-on-carbondata.md           | 72 ++++++++------------
 .../examples/PreAggregateTableExample.scala     | 24 +++++++
 2 files changed, 51 insertions(+), 45 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/ff2a2134/docs/data-management-on-carbondata.md
----------------------------------------------------------------------
diff --git a/docs/data-management-on-carbondata.md b/docs/data-management-on-carbondata.md
index 18ad5b8..c846ffc 100644
--- a/docs/data-management-on-carbondata.md
+++ b/docs/data-management-on-carbondata.md
@@ -627,21 +627,21 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
   
   ```
   LOAD DATA [LOCAL] INPATH 'folder_path' 
-    INTO TABLE [db_name.]table_name PARTITION (partition_spec) 
-    OPTIONS(property_name=property_value, ...)
-  NSERT INTO INTO TABLE [db_name.]table_name PARTITION (partition_spec) SELECT STATMENT 
+  INTO TABLE [db_name.]table_name PARTITION (partition_spec) 
+  OPTIONS(property_name=property_value, ...)
+    
+  INSERT INTO INTO TABLE [db_name.]table_name PARTITION (partition_spec) <SELECT STATMENT>
   ```
   
   Example:
   ```
-  LOAD DATA LOCAL INPATH '${env:HOME}/staticinput.txt'
-    INTO TABLE locationTable
-    PARTITION (country = 'US', state = 'CA')
+  LOAD DATA LOCAL INPATH '${env:HOME}/staticinput.csv'
+  INTO TABLE locationTable
+  PARTITION (country = 'US', state = 'CA')
     
   INSERT INTO TABLE locationTable
-    PARTITION (country = 'US', state = 'AL')
-    SELECT * FROM another_user au 
-    WHERE au.country = 'US' AND au.state = 'AL';
+  PARTITION (country = 'US', state = 'AL')
+  SELECT <columns list excluding partition columns> FROM another_user
   ```
 
 #### Load Data Using Dynamic Partition
@@ -650,12 +650,11 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
 
   Example:
   ```
-  LOAD DATA LOCAL INPATH '${env:HOME}/staticinput.txt'
-    INTO TABLE locationTable
+  LOAD DATA LOCAL INPATH '${env:HOME}/staticinput.csv'
+  INTO TABLE locationTable
           
   INSERT INTO TABLE locationTable
-    SELECT * FROM another_user au 
-    WHERE au.country = 'US' AND au.state = 'AL';
+  SELECT <columns list excluding partition columns> FROM another_user
   ```
 
 #### Show Partitions
@@ -679,19 +678,19 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
   
   ```
    INSERT OVERWRITE TABLE table_name
-    PARTITION (column = 'partition_name')
-    select_statement
+   PARTITION (column = 'partition_name')
+   select_statement
   ```
   
   Example:
   ```
   INSERT OVERWRITE TABLE partitioned_user
-    PARTITION (country = 'US')
-    SELECT * FROM another_user au 
-    WHERE au.country = 'US';
+  PARTITION (country = 'US')
+  SELECT * FROM another_user au 
+  WHERE au.country = 'US';
   ```
 
-### CARBONDATA PARTITION(HASH,RANGE,LIST) -- Alpha feature, this partition not supports update and delete data.
+### CARBONDATA PARTITION(HASH,RANGE,LIST) -- Alpha feature, this partition feature does not support update and delete data.
 
   The partition supports three type:(Hash,Range,List), similar to other system's partition features, CarbonData's partition feature can be used to improve query performance by filtering on the partition column.
 
@@ -886,11 +885,11 @@ will be transformed by Query Planner to fetch data from pre-aggregate table **ag
 
 But queries of kind
 ```
-SELECT user_id, country, sex, sum(quantity), avg(price) from sales GROUP BY country, sex
+SELECT user_id, country, sex, sum(quantity), avg(price) from sales GROUP BY user_id, country, sex
 
 SELECT sex, avg(quantity) from sales GROUP BY sex
 
-SELECT max(price), country from sales GROUP BY country
+SELECT country, max(price) from sales GROUP BY country
 ```
 
 will fetch the data from the main table **sales**
@@ -910,18 +909,13 @@ pre-aggregate tables satisfy the query condition, the plan is transformed automa
 pre-aggregate table to fetch the data
 
 ##### Compacting pre-aggregate tables
-Compaction is an optional operation for pre-aggregate table. If compaction is performed on main 
-table but not performed on pre-aggregate table, all queries still can benefit from pre-aggregate 
-table.To further improve performance on pre-aggregate table, compaction can be triggered on 
-pre-aggregate tables directly, it will merge the segments inside pre-aggregation table. 
-To do that, use ALTER TABLE COMPACT command on the pre-aggregate table just like the main table
+Compaction command (ALTER TABLE COMPACT) need to be run separately on each pre-aggregate table.
+Running Compaction command on main table will **not automatically** compact the pre-aggregate 
+tables.Compaction is an optional operation for pre-aggregate table. If compaction is performed on
+main table but not performed on pre-aggregate table, all queries still can benefit from 
+pre-aggregate tables.To further improve performance on pre-aggregate tables, compaction can be 
+triggered on pre-aggregate tables directly, it will merge the segments inside pre-aggregate table. 
 
-  NOTE:
-  * If the aggregate function used in the pre-aggregate table creation included distinct-count,
-     during compaction, the pre-aggregate table values are recomputed.This would a costly 
-     operation as compared to the compaction of pre-aggregate tables containing other aggregate 
-     functions alone
- 
 ##### Update/Delete Operations on pre-aggregate tables
 This functionality is not supported.
 
@@ -1005,16 +999,6 @@ roll-up for the queries on these hierarchies.
   ) AS
   SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
    avg(price) FROM sales GROUP BY order_time, country, sex
-    
-  CREATE DATAMAP agg_minute
-  ON TABLE sales
-  USING "timeseries"
-  DMPROPERTIES (
-  'event_time’=’order_time’,
-  'minute_granualrity’=’1’,
-  ) AS
-  SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
-   avg(price) FROM sales GROUP BY order_time, country, sex
   ```
   
   For Querying data and automatically roll-up to the desired aggregation level,Carbondata supports 
@@ -1028,9 +1012,7 @@ roll-up for the queries on these hierarchies.
   ```
   
   It is **not necessary** to create pre-aggregate tables for each granularity unless required for 
-  query
-  .Carbondata
-   can roll-up the data and fetch it
+  query.Carbondata can roll-up the data and fetch it.
    
   For Example: For main table **sales** , If pre-aggregate tables were created as  
   

http://git-wip-us.apache.org/repos/asf/carbondata/blob/ff2a2134/examples/spark2/src/main/scala/org/apache/carbondata/examples/PreAggregateTableExample.scala
----------------------------------------------------------------------
diff --git a/examples/spark2/src/main/scala/org/apache/carbondata/examples/PreAggregateTableExample.scala b/examples/spark2/src/main/scala/org/apache/carbondata/examples/PreAggregateTableExample.scala
index fe3a93d..d27eefb 100644
--- a/examples/spark2/src/main/scala/org/apache/carbondata/examples/PreAggregateTableExample.scala
+++ b/examples/spark2/src/main/scala/org/apache/carbondata/examples/PreAggregateTableExample.scala
@@ -135,6 +135,30 @@ object PreAggregateTableExample {
     println("time for query on table without pre-aggregate table:" + time_without_aggTable.toString)
     // scalastyle:on
 
+    // 3. if avg function is defined for a column, sum also can be used on that;but not other way
+    // round
+    val time_without_aggTable_sum = time {
+      spark.sql(
+        s"""
+           | SELECT id, sum(age)
+           | FROM personTableWithoutAgg group by id
+      """.stripMargin).count()
+    }
+
+    val time_with_aggTable_sum = time {
+      spark.sql(
+        s"""
+           | SELECT id, sum(age)
+           | FROM personTable group by id
+      """.stripMargin).count()
+    }
+    // scalastyle:off
+    println("time for query with function sum on table with pre-aggregate table:" +
+      time_with_aggTable_sum.toString)
+    println("time for query with function sum on table without pre-aggregate table:" +
+      time_without_aggTable_sum.toString)
+    // scalastyle:on
+
     spark.sql("DROP TABLE IF EXISTS mainTable")
     spark.sql("DROP TABLE IF EXISTS personTable")
     spark.sql("DROP TABLE IF EXISTS personTableWithoutAgg")


[09/25] carbondata git commit: [CARBONDATA-2215][Documentation] Describe CarbonStreamParser in streaming-guide.md

Posted by ra...@apache.org.
[CARBONDATA-2215][Documentation] Describe CarbonStreamParser in streaming-guide.md

This closes #2016


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/87361a80
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/87361a80
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/87361a80

Branch: refs/heads/branch-1.3
Commit: 87361a8069503a6a3fa6b31e54ed9849259c81c9
Parents: 28c3701
Author: Zhang Zhichao <44...@qq.com>
Authored: Wed Feb 28 23:07:38 2018 +0800
Committer: ravipesala <ra...@gmail.com>
Committed: Sat Mar 3 17:47:42 2018 +0530

----------------------------------------------------------------------
 docs/streaming-guide.md | 74 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/87361a80/docs/streaming-guide.md
----------------------------------------------------------------------
diff --git a/docs/streaming-guide.md b/docs/streaming-guide.md
index 201f8e0..aa9eaef 100644
--- a/docs/streaming-guide.md
+++ b/docs/streaming-guide.md
@@ -152,6 +152,80 @@ property name | default | description
 --- | --- | ---
 carbon.streaming.auto.handoff.enabled | true | whether to auto trigger handoff operation
 
+## Stream data parser
+Config the property "carbon.stream.parser" to define a stream parser to convert InternalRow to Object[] when write stream data.
+
+property name | default | description
+--- | --- | ---
+carbon.stream.parser | org.apache.carbondata.streaming.parser.CSVStreamParserImp | the class of the stream parser
+
+Currently CarbonData support two parsers, as following:
+
+**1. org.apache.carbondata.streaming.parser.CSVStreamParserImp**: This is the default stream parser, it gets a line data(String type) from the first index of InternalRow and converts this String to Object[].
+
+**2. org.apache.carbondata.streaming.parser.RowStreamParserImp**: This stream parser will auto convert InternalRow to Object[] according to schema of this `DataSet`, for example:
+
+```scala
+ case class FileElement(school: Array[String], age: Int)
+ case class StreamData(id: Int, name: String, city: String, salary: Float, file: FileElement)
+ ...
+
+ var qry: StreamingQuery = null
+ val readSocketDF = spark.readStream
+   .format("socket")
+   .option("host", "localhost")
+   .option("port", 9099)
+   .load()
+   .as[String]
+   .map(_.split(","))
+   .map { fields => {
+     val tmp = fields(4).split("\\$")
+     val file = FileElement(tmp(0).split(":"), tmp(1).toInt)
+     StreamData(fields(0).toInt, fields(1), fields(2), fields(3).toFloat, file)
+   } }
+
+ // Write data from socket stream to carbondata file
+ qry = readSocketDF.writeStream
+   .format("carbondata")
+   .trigger(ProcessingTime("5 seconds"))
+   .option("checkpointLocation", tablePath.getStreamingCheckpointDir)
+   .option("dbName", "default")
+   .option("tableName", "carbon_table")
+   .option(CarbonStreamParser.CARBON_STREAM_PARSER,
+     CarbonStreamParser.CARBON_STREAM_PARSER_ROW_PARSER)
+   .start()
+
+ ...
+```
+
+### How to implement a customized stream parser
+If user needs to implement a customized stream parser to convert a specific InternalRow to Object[], it needs to implement `initialize` method and `parserRow` method of interface `CarbonStreamParser`, for example:
+
+```scala
+ package org.XXX.XXX.streaming.parser
+ 
+ import org.apache.hadoop.conf.Configuration
+ import org.apache.spark.sql.catalyst.InternalRow
+ import org.apache.spark.sql.types.StructType
+ 
+ class XXXStreamParserImp extends CarbonStreamParser {
+ 
+   override def initialize(configuration: Configuration, structType: StructType): Unit = {
+     // user can get the properties from "configuration"
+   }
+   
+   override def parserRow(value: InternalRow): Array[Object] = {
+     // convert InternalRow to Object[](Array[Object] in Scala) 
+   }
+   
+   override def close(): Unit = {
+   }
+ }
+   
+```
+
+and then set the property "carbon.stream.parser" to "org.XXX.XXX.streaming.parser.XXXStreamParserImp".
+
 ## Close streaming table
 Use below command to handoff all streaming segments to columnar format segments and modify the streaming property to false, this table becomes a normal table.
 ```sql


[14/25] carbondata git commit: [CARBONDATA-2184] Improve memory reuse for heap memory in `HeapMemoryAllocator`

Posted by ra...@apache.org.
[CARBONDATA-2184] Improve memory reuse for heap memory in `HeapMemoryAllocator`

The description in [SPARK-21860|https://issues.apache.org/jira/browse/SPARK-21860]:
In `HeapMemoryAllocator`, memory is allocated from a pool keyed by memory size.
In practice, sizes such as 1025 bytes, 1026 bytes, ..., 1032 bytes can be treated as the same size, because memory is allocated in multiples of 8 bytes.
Aligning the pool key to 8 bytes therefore improves memory reuse.

This closes #1982
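
For illustration, here is a standalone sketch of the 8-byte alignment rule described above; the object and method names are made up for the example, and the committed change to HeapMemoryAllocator is in the diff below:

```scala
object AlignedPoolKeyDemo {
  // Round a requested size up to a whole number of 8-byte words.
  def alignedSize(size: Long): Long = {
    val numWords = (size + 7) / 8
    numWords * 8L
  }

  def main(args: Array[String]): Unit = {
    // 1025..1032 bytes all share the same pool key (1032), so a freed 1030-byte
    // block can be reused later for, say, a 1026-byte request.
    (1025L to 1032L).foreach(size => println(s"$size -> ${alignedSize(size)}"))
  }
}
```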


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/55fe349d
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/55fe349d
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/55fe349d

Branch: refs/heads/branch-1.3
Commit: 55fe349d0b1f731565a471c42d37eac971e46168
Parents: abb0a0b
Author: Zhang Zhichao <44...@qq.com>
Authored: Sun Feb 18 00:55:04 2018 +0800
Committer: ravipesala <ra...@gmail.com>
Committed: Sat Mar 3 18:02:23 2018 +0530

----------------------------------------------------------------------
 .../core/memory/HeapMemoryAllocator.java        | 52 +++++++++-----
 .../carbondata/core/util/CarbonProperties.java  | 19 ++++++
 .../core/memory/MemoryAllocatorUnitTest.java    | 71 ++++++++++++++++++++
 3 files changed, 127 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/55fe349d/core/src/main/java/org/apache/carbondata/core/memory/HeapMemoryAllocator.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/carbondata/core/memory/HeapMemoryAllocator.java b/core/src/main/java/org/apache/carbondata/core/memory/HeapMemoryAllocator.java
index 5862933..242995b 100644
--- a/core/src/main/java/org/apache/carbondata/core/memory/HeapMemoryAllocator.java
+++ b/core/src/main/java/org/apache/carbondata/core/memory/HeapMemoryAllocator.java
@@ -23,16 +23,27 @@ import java.util.LinkedList;
 import java.util.Map;
 import javax.annotation.concurrent.GuardedBy;
 
+import org.apache.carbondata.core.util.CarbonProperties;
+
 /**
  * Code ported from Apache Spark {org.apache.spark.unsafe.memory} package
  * A simple {@link MemoryAllocator} that can allocate up to 16GB using a JVM long primitive array.
  */
 public class HeapMemoryAllocator implements MemoryAllocator {
 
-  @GuardedBy("this") private final Map<Long, LinkedList<WeakReference<MemoryBlock>>>
+  @GuardedBy("this") private final Map<Long, LinkedList<WeakReference<long[]>>>
       bufferPoolsBySize = new HashMap<>();
 
-  private static final int POOLING_THRESHOLD_BYTES = 1024 * 1024;
+  private int poolingThresholdBytes;
+  private boolean shouldPooling = true;
+
+  public HeapMemoryAllocator() {
+    poolingThresholdBytes = CarbonProperties.getInstance().getHeapMemoryPoolingThresholdBytes();
+    // if set 'poolingThresholdBytes' to -1, it should not go through the pooling mechanism.
+    if (poolingThresholdBytes == -1) {
+      shouldPooling = false;
+    }
+  }
 
   /**
    * Returns true if allocations of the given size should go through the pooling mechanism and
@@ -40,42 +51,53 @@ public class HeapMemoryAllocator implements MemoryAllocator {
    */
   private boolean shouldPool(long size) {
     // Very small allocations are less likely to benefit from pooling.
-    return size >= POOLING_THRESHOLD_BYTES;
+    return shouldPooling && (size >= poolingThresholdBytes);
   }
 
   @Override public MemoryBlock allocate(long size) throws OutOfMemoryError {
-    if (shouldPool(size)) {
+    int numWords = (int) ((size + 7) / 8);
+    long alignedSize = numWords * 8L;
+    assert (alignedSize >= size);
+    if (shouldPool(alignedSize)) {
       synchronized (this) {
-        final LinkedList<WeakReference<MemoryBlock>> pool = bufferPoolsBySize.get(size);
+        final LinkedList<WeakReference<long[]>> pool = bufferPoolsBySize.get(alignedSize);
         if (pool != null) {
           while (!pool.isEmpty()) {
-            final WeakReference<MemoryBlock> blockReference = pool.pop();
-            final MemoryBlock memory = blockReference.get();
-            if (memory != null) {
-              assert (memory.size() == size);
+            final WeakReference<long[]> arrayReference = pool.pop();
+            final long[] array = arrayReference.get();
+            if (array != null) {
+              assert (array.length * 8L >= size);
+              MemoryBlock memory = new MemoryBlock(array, CarbonUnsafe.LONG_ARRAY_OFFSET, size);
               // reuse this MemoryBlock
               memory.setFreedStatus(false);
               return memory;
             }
           }
-          bufferPoolsBySize.remove(size);
+          bufferPoolsBySize.remove(alignedSize);
         }
       }
     }
-    long[] array = new long[(int) ((size + 7) / 8)];
+    long[] array = new long[numWords];
     return new MemoryBlock(array, CarbonUnsafe.LONG_ARRAY_OFFSET, size);
   }
 
   @Override public void free(MemoryBlock memory) {
     final long size = memory.size();
-    if (shouldPool(size)) {
+
+    // As an additional layer of defense against use-after-free bugs, we mutate the
+    // MemoryBlock to null out its reference to the long[] array.
+    long[] array = (long[]) memory.obj;
+    memory.setObjAndOffset(null, 0);
+
+    long alignedSize = ((size + 7) / 8) * 8;
+    if (shouldPool(alignedSize)) {
       synchronized (this) {
-        LinkedList<WeakReference<MemoryBlock>> pool = bufferPoolsBySize.get(size);
+        LinkedList<WeakReference<long[]>> pool = bufferPoolsBySize.get(alignedSize);
         if (pool == null) {
           pool = new LinkedList<>();
-          bufferPoolsBySize.put(size, pool);
+          bufferPoolsBySize.put(alignedSize, pool);
         }
-        pool.add(new WeakReference<>(memory));
+        pool.add(new WeakReference<>(array));
       }
     }
     memory.setFreedStatus(true);

http://git-wip-us.apache.org/repos/asf/carbondata/blob/55fe349d/core/src/main/java/org/apache/carbondata/core/util/CarbonProperties.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/carbondata/core/util/CarbonProperties.java b/core/src/main/java/org/apache/carbondata/core/util/CarbonProperties.java
index 3dc7b8f..667c45c 100644
--- a/core/src/main/java/org/apache/carbondata/core/util/CarbonProperties.java
+++ b/core/src/main/java/org/apache/carbondata/core/util/CarbonProperties.java
@@ -1351,4 +1351,23 @@ public final class CarbonProperties {
         unsafeSortStorageMemory + "");
   }
 
+  /**
+   * Get the heap memory pooling threshold bytes.
+   */
+  public int getHeapMemoryPoolingThresholdBytes() {
+    int thresholdSize;
+    try {
+      thresholdSize = Integer.parseInt(CarbonProperties.getInstance()
+          .getProperty(CarbonCommonConstants.CARBON_HEAP_MEMORY_POOLING_THRESHOLD_BYTES,
+              CarbonCommonConstants.CARBON_HEAP_MEMORY_POOLING_THRESHOLD_BYTES_DEFAULT));
+    } catch (NumberFormatException exc) {
+      LOGGER.error(
+          "The heap memory pooling threshold bytes is invalid. Using the default value "
+              + CarbonCommonConstants.CARBON_HEAP_MEMORY_POOLING_THRESHOLD_BYTES_DEFAULT);
+      thresholdSize = Integer.parseInt(
+          CarbonCommonConstants.CARBON_HEAP_MEMORY_POOLING_THRESHOLD_BYTES_DEFAULT);
+    }
+    return thresholdSize;
+  }
+
 }

http://git-wip-us.apache.org/repos/asf/carbondata/blob/55fe349d/core/src/test/java/org/apache/carbondata/core/memory/MemoryAllocatorUnitTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/carbondata/core/memory/MemoryAllocatorUnitTest.java b/core/src/test/java/org/apache/carbondata/core/memory/MemoryAllocatorUnitTest.java
new file mode 100644
index 0000000..df1e103
--- /dev/null
+++ b/core/src/test/java/org/apache/carbondata/core/memory/MemoryAllocatorUnitTest.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.core.memory;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.carbondata.core.constants.CarbonCommonConstants;
+import org.apache.carbondata.core.util.CarbonProperties;
+
+public class MemoryAllocatorUnitTest {
+
+  @Test
+  public void testHeapMemoryReuse() {
+    MemoryAllocator heapMem = new HeapMemoryAllocator();
+    // The size is less than 1024 * 1024,
+    // allocate new memory every time.
+    MemoryBlock onheap1 = heapMem.allocate(513);
+    Object obj1 = onheap1.getBaseObject();
+    heapMem.free(onheap1);
+    MemoryBlock onheap2 = heapMem.allocate(514);
+    Assert.assertNotEquals(obj1, onheap2.getBaseObject());
+
+    // The size is greater than 1024 * 1024,
+    // reuse the previous memory which has released.
+    MemoryBlock onheap3 = heapMem.allocate(1024 * 1024 + 1);
+    Assert.assertEquals(onheap3.size(), 1024 * 1024 + 1);
+    Object obj3 = onheap3.getBaseObject();
+    heapMem.free(onheap3);
+    MemoryBlock onheap4 = heapMem.allocate(1024 * 1024 + 7);
+    Assert.assertEquals(onheap4.size(), 1024 * 1024 + 7);
+    Assert.assertEquals(obj3, onheap4.getBaseObject());
+  }
+
+  @Test
+  public void testHeapMemoryNotPool() {
+    // not pool
+    CarbonProperties.getInstance()
+        .addProperty(CarbonCommonConstants.CARBON_HEAP_MEMORY_POOLING_THRESHOLD_BYTES, "-1");
+
+    MemoryAllocator heapMem = new HeapMemoryAllocator();
+    MemoryBlock onheap1 = heapMem.allocate(513);
+    Object obj1 = onheap1.getBaseObject();
+    heapMem.free(onheap1);
+    MemoryBlock onheap2 = heapMem.allocate(514);
+    Assert.assertNotEquals(obj1, onheap2.getBaseObject());
+
+    MemoryBlock onheap3 = heapMem.allocate(1024 * 1024 + 1);
+    Assert.assertEquals(onheap3.size(), 1024 * 1024 + 1);
+    Object obj3 = onheap3.getBaseObject();
+    heapMem.free(onheap3);
+    MemoryBlock onheap4 = heapMem.allocate(1024 * 1024 + 7);
+    Assert.assertEquals(onheap4.size(), 1024 * 1024 + 7);
+    Assert.assertNotEquals(obj3, onheap4.getBaseObject());
+  }
+}


[03/25] carbondata git commit: [Documentation] Formatting issue fixed

Posted by ra...@apache.org.
[Documentation] Formatting issue fixed

This closes #1954


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/fd481f58
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/fd481f58
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/fd481f58

Branch: refs/heads/branch-1.3
Commit: fd481f58b2411f72472d47946556f1ecbc4caed5
Parents: ff2a213
Author: Jatin <ja...@knoldus.in>
Authored: Thu Feb 8 16:25:14 2018 +0530
Committer: ravipesala <ra...@gmail.com>
Committed: Sat Mar 3 17:39:54 2018 +0530

----------------------------------------------------------------------
 docs/data-management-on-carbondata.md | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/fd481f58/docs/data-management-on-carbondata.md
----------------------------------------------------------------------
diff --git a/docs/data-management-on-carbondata.md b/docs/data-management-on-carbondata.md
index c846ffc..fba2916 100644
--- a/docs/data-management-on-carbondata.md
+++ b/docs/data-management-on-carbondata.md
@@ -628,8 +628,7 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
   ```
   LOAD DATA [LOCAL] INPATH 'folder_path' 
   INTO TABLE [db_name.]table_name PARTITION (partition_spec) 
-  OPTIONS(property_name=property_value, ...)
-    
+  OPTIONS(property_name=property_value, ...)    
   INSERT INTO INTO TABLE [db_name.]table_name PARTITION (partition_spec) <SELECT STATMENT>
   ```
   
@@ -637,8 +636,7 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
   ```
   LOAD DATA LOCAL INPATH '${env:HOME}/staticinput.csv'
   INTO TABLE locationTable
-  PARTITION (country = 'US', state = 'CA')
-    
+  PARTITION (country = 'US', state = 'CA')  
   INSERT INTO TABLE locationTable
   PARTITION (country = 'US', state = 'AL')
   SELECT <columns list excluding partition columns> FROM another_user
@@ -651,8 +649,7 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
   Example:
   ```
   LOAD DATA LOCAL INPATH '${env:HOME}/staticinput.csv'
-  INTO TABLE locationTable
-          
+  INTO TABLE locationTable          
   INSERT INTO TABLE locationTable
   SELECT <columns list excluding partition columns> FROM another_user
   ```


[11/25] carbondata git commit: [CARBONDATA-2098] Optimize document for datamap

Posted by ra...@apache.org.
[CARBONDATA-2098] Optimize document for datamap

Optimize document for datamap

This closes #2025


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/95ac5eff
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/95ac5eff
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/95ac5eff

Branch: refs/heads/branch-1.3
Commit: 95ac5eff1673c4c595352e7ec6443f49145b2af7
Parents: 877172c
Author: Jacky Li <ja...@qq.com>
Authored: Sat Mar 3 11:34:46 2018 +0800
Committer: ravipesala <ra...@gmail.com>
Committed: Sat Mar 3 17:48:45 2018 +0530

----------------------------------------------------------------------
 docs/datamap/preaggregate-datamap-guide.md | 213 ++++++++++++++++++++++++
 docs/datamap/timeseries-datamap-guide.md   | 135 +++++++++++++++
 2 files changed, 348 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/95ac5eff/docs/datamap/preaggregate-datamap-guide.md
----------------------------------------------------------------------
diff --git a/docs/datamap/preaggregate-datamap-guide.md b/docs/datamap/preaggregate-datamap-guide.md
new file mode 100644
index 0000000..fabfd7d
--- /dev/null
+++ b/docs/datamap/preaggregate-datamap-guide.md
@@ -0,0 +1,213 @@
+# CarbonData Pre-aggregate DataMap
+  
+## Quick example
+Download and unzip spark-2.2.0-bin-hadoop2.7.tgz, and export $SPARK_HOME
+
+Package carbon jar, and copy assembly/target/scala-2.11/carbondata_2.11-x.x.x-SNAPSHOT-shade-hadoop2.7.2.jar to $SPARK_HOME/jars
+```shell
+mvn clean package -DskipTests -Pspark-2.2
+```
+
+Start spark-shell in new terminal, type :paste, then copy and run the following code.
+```scala
+ import java.io.File
+ import org.apache.spark.sql.{CarbonEnv, SparkSession}
+ import org.apache.spark.sql.CarbonSession._
+ import org.apache.spark.sql.streaming.{ProcessingTime, StreamingQuery}
+ import org.apache.carbondata.core.util.path.CarbonStorePath
+ 
+ val warehouse = new File("./warehouse").getCanonicalPath
+ val metastore = new File("./metastore").getCanonicalPath
+ 
+ val spark = SparkSession
+   .builder()
+   .master("local")
+   .appName("preAggregateExample")
+   .config("spark.sql.warehouse.dir", warehouse)
+   .getOrCreateCarbonSession(warehouse, metastore)
+
+ spark.sparkContext.setLogLevel("ERROR")
+
+ // drop table if exists previously
+ spark.sql(s"DROP TABLE IF EXISTS sales")
+ 
+ // Create main table
+ spark.sql(
+   s"""
+      | CREATE TABLE sales (
+      | user_id string,
+      | country string,
+      | quantity int,
+      | price bigint)
+      | STORED BY 'carbondata'
+    """.stripMargin)
+ 
+ // Create pre-aggregate table on the main table
+ // If main table already have data, following command 
+ // will trigger one immediate load to the pre-aggregate table
+ spark.sql(
+   s"""
+      | CREATE DATAMAP agg_sales
+      | ON TABLE sales
+      | USING "preaggregate"
+      | AS
+      | SELECT country, sum(quantity), avg(price)
+      | FROM sales
+      | GROUP BY country
+    """.stripMargin)
+      
+  import spark.implicits._
+  import org.apache.spark.sql.SaveMode
+  import scala.util.Random
+ 
+  // Load data to the main table, it will also
+  // trigger immediate load to pre-aggregate table.
+  // These two loading operation is carried out in a
+  // transactional manner, meaning that the whole 
+  // operation will fail if one of the loading fails
+  val r = new Random()
+  spark.sparkContext.parallelize(1 to 10)
+   .map(x => ("ID." + r.nextInt(100000), "country" + x % 8, x % 50, x % 60))
+   .toDF("user_id", "country", "quantity", "price")
+   .write
+   .format("carbondata")
+   .option("tableName", "sales")
+   .option("compress", "true")
+   .mode(SaveMode.Append)
+   .save()
+      
+  spark.sql(
+    s"""
+       |SELECT country, sum(quantity), avg(price)
+       | from sales GROUP BY country
+     """.stripMargin).show
+
+  spark.stop
+```
+
+##PRE-AGGREGATE DataMap  
+  Pre-aggregate tables are created as DataMaps and managed as tables internally by CarbonData. 
+  User can create as many pre-aggregate datamaps required to improve query performance, 
+  provided the storage requirements and loading speeds are acceptable.
+  
+  Once pre-aggregate datamaps are created, CarbonData's SparkSQL optimizer extension supports to 
+  select the most efficient pre-aggregate datamap and rewrite the SQL to query against the selected 
+  datamap instead of the main table. Since the data size of pre-aggregate datamap is smaller, 
+  user queries are much faster. In our previous experience, we have seen 5X to 100X times faster 
+  in production SQLs.
+    
+  For instance, main table called **sales** which is defined as 
+  
+  ```
+  CREATE TABLE sales (
+    order_time timestamp,
+    user_id string,
+    sex string,
+    country string,
+    quantity int,
+    price bigint)
+  STORED BY 'carbondata'
+  ```
+  
+  User can create pre-aggregate tables using the Create DataMap DDL
+  
+  ```
+  CREATE DATAMAP agg_sales
+  ON TABLE sales
+  USING "preaggregate"
+  AS
+    SELECT country, sex, sum(quantity), avg(price)
+    FROM sales
+    GROUP BY country, sex
+  ```
+  
+#### Functions supported in pre-aggregate table
+
+| Function | Rollup supported |
+|----------|:----------------:|
+| SUM      |Yes               |
+| AVG      |Yes               |
+| MAX      |Yes               |
+| MIN      |Yes               |
+| COUNT    |Yes               |
+
+
+#### How pre-aggregate tables are selected
+When a user query is submitted, during query planning phase, CarbonData will collect all matched 
+pre-aggregate tables as candidates according to Relational Algebra transformation rules. Then, the 
+best pre-aggregate table for this query will be selected among the candidates based on cost. 
+For simplicity, current cost estimation is based on the data size of the pre-aggregate table. (We 
+assume that query will be faster on smaller table)
+
+For the main table **sales** and pre-aggregate table **agg_sales** created above, following queries 
+```
+SELECT country, sex, sum(quantity), avg(price) from sales GROUP BY country, sex
+
+SELECT sex, sum(quantity) from sales GROUP BY sex
+
+SELECT sum(price), country from sales GROUP BY country
+``` 
+
+will be transformed by CarbonData's query planner to query against pre-aggregate table 
+**agg_sales** instead of the main table **sales**
+
+However, for following queries
+```
+SELECT user_id, country, sex, sum(quantity), avg(price) from sales GROUP BY user_id, country, sex
+
+SELECT sex, avg(quantity) from sales GROUP BY sex
+
+SELECT country, max(price) from sales GROUP BY country
+```
+
+will query against main table **sales** only, because it does not satisfy pre-aggregate table 
+selection logic. 
+
+#### Loading data to pre-aggregate tables
+For existing table with loaded data, data load to pre-aggregate table will be triggered by the 
+CREATE DATAMAP statement when user creates the pre-aggregate table. For incremental loads after 
+aggregates tables are created, loading data to main table triggers the load to pre-aggregate tables 
+once main table loading is complete. 
+
+These loads are transactional 
+meaning that data on main table and pre-aggregate tables are only visible to the user after all 
+tables are loaded successfully, if one of these loads fails, new data are not visible in all tables 
+as if the load operation is not happened.   
+
+#### Querying data from pre-aggregate tables
+As a technique for query acceleration, Pre-aggregate tables cannot be queries directly. 
+Queries are to be made on main table. While doing query planning, internally CarbonData will check 
+associated pre-aggregate tables with the main table, and do query plan transformation accordingly. 
+
+User can verify whether a query can leverage pre-aggregate table or not by executing `EXPLAIN`
+command, which will show the transformed logical plan, and thus user can check whether pre-aggregate
+table is selected.
+
+#### Compacting pre-aggregate tables
+Running Compaction command (`ALTER TABLE COMPACT`) on main table will **not automatically** 
+compact the pre-aggregate tables created on the main table. User need to run Compaction command 
+separately on each pre-aggregate table to compact them.
+
+Compaction is an optional operation for pre-aggregate table. If compaction is performed on
+main table but not performed on pre-aggregate table, all queries still can benefit from 
+pre-aggregate tables. To further improve the query performance, compaction on pre-aggregate tables 
+can be triggered to merge the segments and files in the pre-aggregate tables. 
+
+#### Data Management on pre-aggregate tables
+Once there is pre-aggregate table created on the main table, following command on the main table
+is not supported:
+1. Data management command: `UPDATE/DELETE/DELETE SEGMENT`. 
+2. Schema management command: `ALTER TABLE DROP COLUMN`, `ALTER TABLE CHANGE DATATYPE`, 
+`ALTER TABLE RENAME`. Note that adding a new column is supported, and for dropping columns and 
+change datatype command, CarbonData will check whether it will impact the pre-aggregate table, if 
+ not, the operation is allowed, otherwise operation will be rejected by throwing exception.   
+3. Partition management command: `ALTER TABLE ADD/DROP PARTITION`
+
+However, there is still way to support these operations on main table, in current CarbonData 
+release, user can do as following:
+1. Remove the pre-aggregate table by `DROP DATAMAP` command
+2. Carry out the data management operation on main table
+3. Create the pre-aggregate table again by `CREATE DATAMAP` command
+Basically, user can manually trigger the operation by re-building the datamap.
+
+

http://git-wip-us.apache.org/repos/asf/carbondata/blob/95ac5eff/docs/datamap/timeseries-datamap-guide.md
----------------------------------------------------------------------
diff --git a/docs/datamap/timeseries-datamap-guide.md b/docs/datamap/timeseries-datamap-guide.md
new file mode 100644
index 0000000..ecd7234
--- /dev/null
+++ b/docs/datamap/timeseries-datamap-guide.md
@@ -0,0 +1,135 @@
+# CarbonData Timeseries DataMap
+
+## Supporting timeseries data (Alpha feature in 1.3.0)
+Timeseries DataMap a pre-aggregate table implementation based on 'preaggregate' DataMap. 
+Difference is that Timerseries DataMap has built-in understanding of time hierarchy and 
+levels: year, month, day, hour, minute, so that it supports automatic roll-up in time dimension 
+for query.
+  
+For instance, user can create multiple timeseries datamap on the main table which has a *event_time*
+column, one datamap for one time granularity. Then Carbondata can do automatic roll-up for queries 
+on the main table.
+
+```
+CREATE DATAMAP agg_year
+ON TABLE sales
+USING "timeseries"
+DMPROPERTIES (
+  'event_time'='order_time',
+  'year_granualrity'='1',
+) AS
+SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
+ avg(price) FROM sales GROUP BY order_time, country, sex
+  
+CREATE DATAMAP agg_month
+ON TABLE sales
+USING "timeseries"
+DMPROPERTIES (
+  'event_time'='order_time',
+  'month_granualrity'='1',
+) AS
+SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
+ avg(price) FROM sales GROUP BY order_time, country, sex
+  
+CREATE DATAMAP agg_day
+ON TABLE sales
+USING "timeseries"
+DMPROPERTIES (
+  'event_time'='order_time',
+  'day_granualrity'='1',
+) AS
+SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
+ avg(price) FROM sales GROUP BY order_time, country, sex
+      
+CREATE DATAMAP agg_sales_hour
+ON TABLE sales
+USING "timeseries"
+DMPROPERTIES (
+  'event_time'='order_time',
+  'hour_granualrity'='1',
+) AS
+SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
+ avg(price) FROM sales GROUP BY order_time, country, sex
+
+CREATE DATAMAP agg_minute
+ON TABLE sales
+USING "timeseries"
+DMPROPERTIES (
+  'event_time'='order_time',
+  'minute_granualrity'='1',
+) AS
+SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
+ avg(price) FROM sales GROUP BY order_time, country, sex
+  
+CREATE DATAMAP agg_minute
+ON TABLE sales
+USING "timeseries"
+DMPROPERTIES (
+  'event_time'='order_time',
+  'minute_granualrity'='1',
+) AS
+SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
+ avg(price) FROM sales GROUP BY order_time, country, sex
+```
+  
+For querying timeseries data, Carbondata has builtin support for following time related UDF 
+to enable automatically roll-up to the desired aggregation level
+```
+timeseries(timeseries column name, 'aggregation level')
+```
+```
+SELECT timeseries(order_time, 'hour'), sum(quantity) FROM sales GROUP BY timeseries(order_time,
+'hour')
+```
+  
+It is **not necessary** to create pre-aggregate tables for each granularity unless required for 
+query. Carbondata can roll-up the data and fetch it.
+ 
+For Example: For main table **sales** , if following timeseries datamaps were created for day 
+level and hour level pre-aggregate
+  
+```
+  CREATE DATAMAP agg_day
+  ON TABLE sales
+  USING "timeseries"
+  DMPROPERTIES (
+    'event_time'='order_time',
+    'day_granualrity'='1',
+  ) AS
+  SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
+   avg(price) FROM sales GROUP BY order_time, country, sex
+        
+  CREATE DATAMAP agg_sales_hour
+  ON TABLE sales
+  USING "timeseries"
+  DMPROPERTIES (
+    'event_time'='order_time',
+    'hour_granualrity'='1',
+  ) AS
+  SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
+   avg(price) FROM sales GROUP BY order_time, country, sex
+```
+
+Queries like below will be rolled-up and hit the timeseries datamaps
+```
+Select timeseries(order_time, 'month'), sum(quantity) from sales group by timeseries(order_time,
+  'month')
+  
+Select timeseries(order_time, 'year'), sum(quantity) from sales group by timeseries(order_time,
+  'year')
+```
+
+NOTE (<b>RESTRICTION</b>):
+* Only value of 1 is supported for hierarchy levels. Other hierarchy levels will be supported in
+the future CarbonData release. 
+* timeseries datamap for the desired levels needs to be created one after the other
+* timeseries datamaps created for each level needs to be dropped separately 
+      
+
+#### Compacting timeseries datamp
+Refer to Compaction section in [preaggregation datamap](https://github.com/apache/carbondata/blob/master/docs/datamap/preaggregate-datamap-guide.md). 
+Same applies to timeseries datamap.
+
+#### Data Management on timeseries datamap
+Refer to Data Management section in [preaggregation datamap](https://github.com/apache/carbondata/blob/master/docs/datamap/preaggregate-datamap-guide.md).
+Same applies to timeseries datamap.
\ No newline at end of file


[21/25] carbondata git commit: [CARBONDATA-1506] Fix SDV error in PushUP_FILTER_uniqdata_TC075

Posted by ra...@apache.org.
[CARBONDATA-1506] Fix SDV error in PushUP_FILTER_uniqdata_TC075

Fix SDV error in PushUP_FILTER_uniqdata_TC075 by rounding the covar_samp result to 4 decimal places in both the CarbonData and Hive queries before comparison

This closes #1941


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/f1a73bd9
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/f1a73bd9
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/f1a73bd9

Branch: refs/heads/branch-1.3
Commit: f1a73bd924498efd62e044124ddb240640faa152
Parents: 65234b2
Author: xubo245 <60...@qq.com>
Authored: Wed Feb 7 16:34:53 2018 +0800
Committer: ravipesala <ra...@gmail.com>
Committed: Sat Mar 3 18:05:19 2018 +0530

----------------------------------------------------------------------
 .../carbondata/cluster/sdv/generated/QueriesBasicTestCase.scala  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/f1a73bd9/integration/spark-common-cluster-test/src/test/scala/org/apache/carbondata/cluster/sdv/generated/QueriesBasicTestCase.scala
----------------------------------------------------------------------
diff --git a/integration/spark-common-cluster-test/src/test/scala/org/apache/carbondata/cluster/sdv/generated/QueriesBasicTestCase.scala b/integration/spark-common-cluster-test/src/test/scala/org/apache/carbondata/cluster/sdv/generated/QueriesBasicTestCase.scala
index b663eb4..eee4f7f 100644
--- a/integration/spark-common-cluster-test/src/test/scala/org/apache/carbondata/cluster/sdv/generated/QueriesBasicTestCase.scala
+++ b/integration/spark-common-cluster-test/src/test/scala/org/apache/carbondata/cluster/sdv/generated/QueriesBasicTestCase.scala
@@ -4239,8 +4239,8 @@ class QueriesBasicTestCase extends QueryTest with BeforeAndAfterAll {
   //PushUP_FILTER_uniqdata_TC075
   test("PushUP_FILTER_uniqdata_TC075", Include) {
 
-    checkAnswer(s"""select covar_samp(1,2) from uniqdata where CUST_ID IS NULL or DOB IS NOT NULL or BIGINT_COLUMN1 =1233720368578 or DECIMAL_COLUMN1 = 12345678901.1234000058 or Double_COLUMN1 = 1.12345674897976E10 or INTEGER_COLUMN1 IS NULL """,
-      s"""select covar_samp(1,2) from uniqdata_hive where CUST_ID IS NULL or DOB IS NOT NULL or BIGINT_COLUMN1 =1233720368578 or DECIMAL_COLUMN1 = 12345678901.1234000058 or Double_COLUMN1 = 1.12345674897976E10 or INTEGER_COLUMN1 IS NULL """, "QueriesBasicTestCase_PushUP_FILTER_uniqdata_TC075")
+    checkAnswer(s"""select round(covar_samp(1,2), 4) from uniqdata where CUST_ID IS NULL or DOB IS NOT NULL or BIGINT_COLUMN1 =1233720368578 or DECIMAL_COLUMN1 = 12345678901.1234000058 or Double_COLUMN1 = 1.12345674897976E10 or INTEGER_COLUMN1 IS NULL """,
+      s"""select round(covar_samp(1,2), 4) from uniqdata_hive where CUST_ID IS NULL or DOB IS NOT NULL or BIGINT_COLUMN1 =1233720368578 or DECIMAL_COLUMN1 = 12345678901.1234000058 or Double_COLUMN1 = 1.12345674897976E10 or INTEGER_COLUMN1 IS NULL """, "QueriesBasicTestCase_PushUP_FILTER_uniqdata_TC075")
 
   }
 


[08/25] carbondata git commit: [CARBONDATA-2214][Docs] Remove config 'spark.sql.hive.thriftServer.singleSession' from installation-guide.md

Posted by ra...@apache.org.
[CARBONDATA-2214][Docs] Remove config 'spark.sql.hive.thriftServer.singleSession' from installation-guide.md

Remove the 'spark.sql.hive.thriftServer.singleSession' setting from the default spark-submit commands in installation-guide.md and describe it instead as an optional single-session mode configuration

This closes #2013


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/28c3701c
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/28c3701c
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/28c3701c

Branch: refs/heads/branch-1.3
Commit: 28c3701c21260b1887411e0ea6c87f96be060c81
Parents: 433bdf3
Author: Zhang Zhichao <44...@qq.com>
Authored: Wed Feb 28 16:11:00 2018 +0800
Committer: ravipesala <ra...@gmail.com>
Committed: Sat Mar 3 17:47:32 2018 +0530

----------------------------------------------------------------------
 docs/installation-guide.md | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/28c3701c/docs/installation-guide.md
----------------------------------------------------------------------
diff --git a/docs/installation-guide.md b/docs/installation-guide.md
index 1ba5dd1..0c8790b 100644
--- a/docs/installation-guide.md
+++ b/docs/installation-guide.md
@@ -141,7 +141,6 @@ mv carbondata.tar.gz carbonlib/
 
 ```
 ./bin/spark-submit
---conf spark.sql.hive.thriftServer.singleSession=true
 --class org.apache.carbondata.spark.thriftserver.CarbonThriftServer
 $SPARK_HOME/carbonlib/$CARBON_ASSEMBLY_JAR <carbon_store_path>
 ```
@@ -151,13 +150,23 @@ $SPARK_HOME/carbonlib/$CARBON_ASSEMBLY_JAR <carbon_store_path>
 | CARBON_ASSEMBLY_JAR | CarbonData assembly jar name present in the `$SPARK_HOME/carbonlib/` folder. | carbondata_2.xx-x.x.x-SNAPSHOT-shade-hadoop2.7.2.jar |
 | carbon_store_path | This is a parameter to the CarbonThriftServer class. This a HDFS path where CarbonData files will be kept. Strongly Recommended to put same as carbon.storelocation parameter of carbon.properties. | `hdfs://<host_name>:port/user/hive/warehouse/carbon.store` |
 
+**NOTE**: From Spark 1.6, by default the Thrift server runs in multi-session mode. Which means each JDBC/ODBC connection owns a copy of their own SQL configuration and temporary function registry. Cached tables are still shared though. If you prefer to run the Thrift server in single-session mode and share all SQL configuration and temporary function registry, please set option `spark.sql.hive.thriftServer.singleSession` to `true`. You may either add this option to `spark-defaults.conf`, or pass it to `spark-submit.sh` via `--conf`:
+
+```
+./bin/spark-submit
+--conf spark.sql.hive.thriftServer.singleSession=true
+--class org.apache.carbondata.spark.thriftserver.CarbonThriftServer
+$SPARK_HOME/carbonlib/$CARBON_ASSEMBLY_JAR <carbon_store_path>
+```
+
+**But** in single-session mode, if one user changes the database from one connection, the database of the other connections will be changed too.
+
 **Examples**
    
    * Start with default memory and executors.
 
 ```
 ./bin/spark-submit
---conf spark.sql.hive.thriftServer.singleSession=true
 --class org.apache.carbondata.spark.thriftserver.CarbonThriftServer 
 $SPARK_HOME/carbonlib
 /carbondata_2.xx-x.x.x-SNAPSHOT-shade-hadoop2.7.2.jar
@@ -167,7 +176,7 @@ hdfs://<host_name>:port/user/hive/warehouse/carbon.store
    * Start with Fixed executors and resources.
 
 ```
-./bin/spark-submit --conf spark.sql.hive.thriftServer.singleSession=true 
+./bin/spark-submit
 --class org.apache.carbondata.spark.thriftserver.CarbonThriftServer 
 --num-executors 3 --driver-memory 20g --executor-memory 250g 
 --executor-cores 32 


[24/25] carbondata git commit: [CARBONDATA-2098] Optimize pre-aggregate documentation

Posted by ra...@apache.org.
[CARBONDATA-2098] Optimize pre-aggregate documentation

optimize pre-aggregate documentation
move to separate file
add more examples

This closes #2022


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/ec893412
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/ec893412
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/ec893412

Branch: refs/heads/branch-1.3
Commit: ec893412bc40b72a642a0173f5f5b3fdcbba0877
Parents: a816e0c
Author: sraghunandan <ca...@gmail.com>
Authored: Fri Mar 2 17:02:39 2018 +0530
Committer: ravipesala <ra...@gmail.com>
Committed: Sat Mar 3 18:11:51 2018 +0530

----------------------------------------------------------------------
 docs/data-management-on-carbondata.md           | 242 --------------
 docs/preaggregate-guide.md                      | 313 +++++++++++++++++++
 .../examples/PreAggregateTableExample.scala     |  50 ++-
 3 files changed, 362 insertions(+), 243 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/ec893412/docs/data-management-on-carbondata.md
----------------------------------------------------------------------
diff --git a/docs/data-management-on-carbondata.md b/docs/data-management-on-carbondata.md
index ea80d41..2aa4a49 100644
--- a/docs/data-management-on-carbondata.md
+++ b/docs/data-management-on-carbondata.md
@@ -26,7 +26,6 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
 * [UPDATE AND DELETE](#update-and-delete)
 * [COMPACTION](#compaction)
 * [PARTITION](#partition)
-* [PRE-AGGREGATE TABLES](#pre-aggregate-tables)
 * [BUCKETING](#bucketing)
 * [SEGMENT MANAGEMENT](#segment-management)
 
@@ -859,247 +858,6 @@ This tutorial is going to introduce all commands and data operations on CarbonDa
   * The partitioned column can be excluded from SORT_COLUMNS, this will let other columns to do the efficient sorting.
   * When writing SQL on a partition table, try to use filters on the partition column.
 
-
-## PRE-AGGREGATE TABLES
-  CarbonData supports pre aggregating of data so that OLAP kind of queries can fetch data 
-  much faster. Aggregate tables are created as datamaps so that the handling is as efficient as 
-  other indexing support. Users can create as many aggregate tables they require as datamaps to 
-  improve their query performance, provided the storage requirements and loading speeds are 
-  acceptable.
-  
-  For main table called **sales** which is defined as 
-  
-  ```
-  CREATE TABLE sales (
-            order_time timestamp,
-            user_id STRING,
-            sex STRING,
-            country STRING,
-            quantity INT,
-            price BIGINT)
-  STORED BY 'carbondata'
-  ```
-  
-  user can create pre-aggregate tables using the DDL
-  
-  ```
-  CREATE DATAMAP agg_sales
-  ON TABLE sales
-  USING "preaggregate"
-  AS
-  SELECT country, sex, sum(quantity), avg(price)
-  FROM sales
-  GROUP BY country, sex
-  ```
-  
-<b><p align="left">Functions supported in pre-aggregate tables</p></b>
-
-| Function | Rollup supported |
-|-----------|----------------|
-| SUM | Yes |
-| AVG | Yes |
-| MAX | Yes |
-| MIN | Yes |
-| COUNT | Yes |
-
-
-##### How pre-aggregate tables are selected
-For the main table **sales** and pre-aggregate table **agg_sales** created above, queries of the 
-kind
-```
-SELECT country, sex, sum(quantity), avg(price) from sales GROUP BY country, sex
-
-SELECT sex, sum(quantity) from sales GROUP BY sex
-
-SELECT sum(price), country from sales GROUP BY country
-``` 
-
-will be transformed by Query Planner to fetch data from pre-aggregate table **agg_sales**
-
-But queries of kind
-```
-SELECT user_id, country, sex, sum(quantity), avg(price) from sales GROUP BY user_id, country, sex
-
-SELECT sex, avg(quantity) from sales GROUP BY sex
-
-SELECT country, max(price) from sales GROUP BY country
-```
-
-will fetch the data from the main table **sales**
-
-##### Loading data to pre-aggregate tables
-For existing table with loaded data, data load to pre-aggregate table will be triggered by the 
-CREATE DATAMAP statement when user creates the pre-aggregate table.
-For incremental loads after aggregates tables are created, loading data to main table triggers 
-the load to pre-aggregate tables once main table loading is complete. These loads are automic 
-meaning that data on main table and aggregate tables are only visible to the user after all tables 
-are loaded
-
-##### Querying data from pre-aggregate tables
-Pre-aggregate tables cannot be queries directly. Queries are to be made on main table. Internally 
-carbondata will check associated pre-aggregate tables with the main table, and if the 
-pre-aggregate tables satisfy the query condition, the plan is transformed automatically to use 
-pre-aggregate table to fetch the data.
-
-##### Compacting pre-aggregate tables
-Compaction command (ALTER TABLE COMPACT) need to be run separately on each pre-aggregate table.
-Running Compaction command on main table will **not automatically** compact the pre-aggregate 
-tables. Compaction is an optional operation for pre-aggregate table. If compaction is performed on
-main table but not performed on pre-aggregate table, all queries still can benefit from 
-pre-aggregate tables. To further improve performance on pre-aggregate tables, compaction can be 
-triggered on pre-aggregate tables directly, it will merge the segments inside pre-aggregate table. 
-
-##### Update/Delete Operations on pre-aggregate tables
-This functionality is not supported.
-
-  NOTE (<b>RESTRICTION</b>):
-  Update/Delete operations are <b>not supported</b> on main table which has pre-aggregate tables 
-  created on it. All the pre-aggregate tables <b>will have to be dropped</b> before update/delete 
-  operations can be performed on the main table. Pre-aggregate tables can be rebuilt manually 
-  after update/delete operations are completed
- 
-##### Delete Segment Operations on pre-aggregate tables
-This functionality is not supported.
-
-  NOTE (<b>RESTRICTION</b>):
-  Delete Segment operations are <b>not supported</b> on main table which has pre-aggregate tables 
-  created on it. All the pre-aggregate tables <b>will have to be dropped</b> before delete segment 
-  operations can be performed on the main table. Pre-aggregate tables can be rebuilt manually 
-  after delete segment operations are completed
-  
-##### Alter Table Operations on pre-aggregate tables
-This functionality is not supported.
-
-  NOTE (<b>RESTRICTION</b>):
-  Adding new column in new table does not have any affect on pre-aggregate tables. However if 
-  dropping or renaming a column has impact in pre-aggregate table, such operations will be 
-  rejected and error will be thrown. All the pre-aggregate tables <b>will have to be dropped</b> 
-  before alter operations can be performed on the main table. Pre-aggregate tables can be rebuilt 
-  manually after alter table operations are completed
-  
-### Supporting timeseries data (Alpha feature in 1.3.0)
-CarbonData has built-in understanding of time hierarchy and levels: year, month, day, hour, minute, second.
-Timeseries pre-aggregate tables can be created for the hierarchy and CarbonData can do automatic 
-roll-up for the queries on these hierarchies.
-
-  ```
-  CREATE DATAMAP agg_year
-  ON TABLE sales
-  USING "timeseries"
-  DMPROPERTIES (
-  'event_time'='order_time',
-  'year_granualrity'='1',
-  ) AS
-  SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
-   avg(price) FROM sales GROUP BY order_time, country, sex
-    
-  CREATE DATAMAP agg_month
-  ON TABLE sales
-  USING "timeseries"
-  DMPROPERTIES (
-  'event_time'='order_time',
-  'month_granualrity'='1',
-  ) AS
-  SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
-   avg(price) FROM sales GROUP BY order_time, country, sex
-    
-  CREATE DATAMAP agg_day
-  ON TABLE sales
-  USING "timeseries"
-  DMPROPERTIES (
-  'event_time'='order_time',
-  'day_granualrity'='1',
-  ) AS
-  SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
-   avg(price) FROM sales GROUP BY order_time, country, sex
-        
-  CREATE DATAMAP agg_sales_hour
-  ON TABLE sales
-  USING "timeseries"
-  DMPROPERTIES (
-  'event_time'='order_time',
-  'hour_granualrity'='1',
-  ) AS
-  SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
-   avg(price) FROM sales GROUP BY order_time, country, sex
-  
-  CREATE DATAMAP agg_minute
-  ON TABLE sales
-  USING "timeseries"
-  DMPROPERTIES (
-  'event_time'='order_time',
-  'minute_granualrity'='1',
-  ) AS
-  SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
-   avg(price) FROM sales GROUP BY order_time, country, sex
-    
-  CREATE DATAMAP agg_minute
-  ON TABLE sales
-  USING "timeseries"
-  DMPROPERTIES (
-  'event_time'='order_time',
-  'minute_granualrity'='1',
-  ) AS
-  SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
-   avg(price) FROM sales GROUP BY order_time, country, sex
-  ```
-  For Querying data and automatically roll-up to the desired aggregation level, CarbonData supports 
-  UDF as
-  ```
-  timeseries(timeseries column name, 'aggregation level')
-  ```
-  Examples
-  ```
-  SELECT 
-        timeseries(order_time, 'hour'), 
-        sum(quantity) 
-  FROM sales 
-  GROUP BY timeseries(order_time, 'hour')
-  ```
-  
-  It is **not necessary** to create pre-aggregate tables for each granularity unless required for 
-  query. CarbonData can roll-up the data and fetch it.
-   
-  For Example: For main table **sales**, If timeseries pre-aggregate tables were created as  
-  
-  ```
-  CREATE DATAMAP agg_day
-    ON TABLE sales
-    USING "timeseries"
-    DMPROPERTIES (
-    'event_time'='order_time',
-    'day_granualrity'='1',
-    ) AS
-    SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
-     avg(price) FROM sales GROUP BY order_time, country, sex
-          
-    CREATE DATAMAP agg_sales_hour
-    ON TABLE sales
-    USING "timeseries"
-    DMPROPERTIES (
-    'event_time'='order_time',
-    'hour_granualrity'='1',
-    ) AS
-    SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
-     avg(price) FROM sales GROUP BY order_time, country, sex
-  ```
-  
-  Queries like below will be rolled-up and fetched from pre-aggregate tables
-  ```
-  Select timeseries(order_time, 'month'), sum(quantity) from sales group by timeseries(order_time,
-    'month')
-    
-  Select timeseries(order_time, 'year'), sum(quantity) from sales group by timeseries(order_time,
-    'year')
-  ```
-  
-  NOTE (<b>RESTRICTION</b>):
-  * Only 1 is supported for granularity value of timeseries pre-aggregate table. Other granularity value are not supported.
-  * Only one granularity can be defined on creating one timeseries pre-aggregate table. Other granularity are created separately.
-  * Pre-aggregate tables for the desired levels needs to be created one after the other
-  * Pre-aggregate tables are created for each level needs to be dropped separately 
-    
-
 ## BUCKETING
 
   Bucketing feature can be used to distribute/organize the table/partition data into multiple files such

http://git-wip-us.apache.org/repos/asf/carbondata/blob/ec893412/docs/preaggregate-guide.md
----------------------------------------------------------------------
diff --git a/docs/preaggregate-guide.md b/docs/preaggregate-guide.md
new file mode 100644
index 0000000..411433a
--- /dev/null
+++ b/docs/preaggregate-guide.md
@@ -0,0 +1,313 @@
+# CarbonData Pre-aggregate tables
+  
+## Quick example
+Download and unzip spark-2.2.0-bin-hadoop2.7.tgz, and export $SPARK_HOME
+
+Package carbon jar, and copy assembly/target/scala-2.11/carbondata_2.11-x.x.x-SNAPSHOT-shade-hadoop2.7.2.jar to $SPARK_HOME/jars
+```shell
+mvn clean package -DskipTests -Pspark-2.2
+```
+
+Start spark-shell in new terminal, type :paste, then copy and run the following code.
+```scala
+ import java.io.File
+ import org.apache.spark.sql.{CarbonEnv, SparkSession}
+ import org.apache.spark.sql.CarbonSession._
+ import org.apache.spark.sql.streaming.{ProcessingTime, StreamingQuery}
+ import org.apache.carbondata.core.util.path.CarbonStorePath
+ 
+ val warehouse = new File("./warehouse").getCanonicalPath
+ val metastore = new File("./metastore").getCanonicalPath
+ 
+ val spark = SparkSession
+   .builder()
+   .master("local")
+   .appName("preAggregateExample")
+   .config("spark.sql.warehouse.dir", warehouse)
+   .getOrCreateCarbonSession(warehouse, metastore)
+
+ spark.sparkContext.setLogLevel("ERROR")
+
+ // drop table if exists previously
+ spark.sql(s"DROP TABLE IF EXISTS sales")
+ // Create target carbon table and populate with initial data
+ spark.sql(
+   s"""
+      | CREATE TABLE sales (
+      | user_id string,
+      | country string,
+      | quantity int,
+      | price bigint)
+      | STORED BY 'carbondata'""".stripMargin)
+      
+ spark.sql(
+   s"""
+      | CREATE DATAMAP agg_sales
+      | ON TABLE sales
+      | USING "preaggregate"
+      | AS
+      | SELECT country, sum(quantity), avg(price)
+      | FROM sales
+      | GROUP BY country""".stripMargin)
+      
+ import spark.implicits._
+ import org.apache.spark.sql.SaveMode
+ import scala.util.Random
+ 
+ val r = new Random()
+ val df = spark.sparkContext.parallelize(1 to 10)
+   .map(x => ("ID." + r.nextInt(100000), "country" + x % 8, x % 50, x % 60))
+   .toDF("user_id", "country", "quantity", "price")
+
+ // Create table with pre-aggregate table
+ df.write.format("carbondata")
+   .option("tableName", "sales")
+   .option("compress", "true")
+   .mode(SaveMode.Append).save()
+      
+ spark.sql(
+      s"""
+    |SELECT country, sum(quantity), avg(price)
+    | from sales GROUP BY country""".stripMargin).show
+
+ spark.stop
+```
+
+## PRE-AGGREGATE TABLES
+  CarbonData supports pre-aggregating of data so that OLAP kind of queries can fetch data
+  much faster. Aggregate tables are created as datamaps so that the handling is as efficient as
+  other indexing support. Users can create as many aggregate tables as they require as datamaps to
+  improve their query performance, provided the storage requirements and loading speeds are
+  acceptable.
+  
+  For main table called **sales** which is defined as 
+  
+  ```
+  CREATE TABLE sales (
+  order_time timestamp,
+  user_id string,
+  sex string,
+  country string,
+  quantity int,
+  price bigint)
+  STORED BY 'carbondata'
+  ```
+  
+  user can create pre-aggregate tables using the DDL
+  
+  ```
+  CREATE DATAMAP agg_sales
+  ON TABLE sales
+  USING "preaggregate"
+  AS
+  SELECT country, sex, sum(quantity), avg(price)
+  FROM sales
+  GROUP BY country, sex
+  ```
+  
+
+  
+<b><p align="left">Functions supported in pre-aggregate tables</p></b>
+
+| Function | Rollup supported |
+|-----------|----------------|
+| SUM | Yes |
+| AVG | Yes |
+| MAX | Yes |
+| MIN | Yes |
+| COUNT | Yes |
+
+
+##### How pre-aggregate tables are selected
+For the main table **sales** and pre-aggregate table **agg_sales** created above, queries of the 
+kind
+```
+SELECT country, sex, sum(quantity), avg(price) from sales GROUP BY country, sex
+
+SELECT sex, sum(quantity) from sales GROUP BY sex
+
+SELECT sum(price), country from sales GROUP BY country
+``` 
+
+will be transformed by Query Planner to fetch data from pre-aggregate table **agg_sales**
+
+But queries of kind
+```
+SELECT user_id, country, sex, sum(quantity), avg(price) from sales GROUP BY user_id, country, sex
+
+SELECT sex, avg(quantity) from sales GROUP BY sex
+
+SELECT country, max(price) from sales GROUP BY country
+```
+
+will fetch the data from the main table **sales**
+
+##### Loading data to pre-aggregate tables
+For an existing table with loaded data, the data load to the pre-aggregate table is triggered by the
+CREATE DATAMAP statement when the user creates the pre-aggregate table.
+For incremental loads after aggregate tables are created, loading data to the main table triggers
+the load to the pre-aggregate tables once main table loading is complete. These loads are atomic,
+meaning that data in the main table and aggregate tables is only visible to the user after all tables
+are loaded.
+
+##### Querying data from pre-aggregate tables
+Pre-aggregate tables cannot be queried directly. Queries are to be made on the main table. Internally,
+CarbonData will check the pre-aggregate tables associated with the main table, and if a
+pre-aggregate table satisfies the query condition, the plan is transformed automatically to use the
+pre-aggregate table to fetch the data.
+
+##### Compacting pre-aggregate tables
+The compaction command (ALTER TABLE COMPACT) needs to be run separately on each pre-aggregate table.
+Running the compaction command on the main table will **not automatically** compact the pre-aggregate
+tables. Compaction is an optional operation for a pre-aggregate table. If compaction is performed on the
+main table but not on the pre-aggregate table, all queries can still benefit from the
+pre-aggregate tables. To further improve performance on pre-aggregate tables, compaction can be
+triggered on them directly; it will merge the segments inside the pre-aggregate table.
+
+##### Update/Delete Operations on pre-aggregate tables
+This functionality is not supported.
+
+  NOTE (<b>RESTRICTION</b>):
+  Update/Delete operations are <b>not supported</b> on main table which has pre-aggregate tables 
+  created on it. All the pre-aggregate tables <b>will have to be dropped</b> before update/delete 
+  operations can be performed on the main table. Pre-aggregate tables can be rebuilt manually 
+  after update/delete operations are completed
+ 
+##### Delete Segment Operations on pre-aggregate tables
+This functionality is not supported.
+
+  NOTE (<b>RESTRICTION</b>):
+  Delete Segment operations are <b>not supported</b> on main table which has pre-aggregate tables 
+  created on it. All the pre-aggregate tables <b>will have to be dropped</b> before delete segment
+  operations can be performed on the main table. Pre-aggregate tables can be rebuilt manually
+  after delete segment operations are completed.
+  
+##### Alter Table Operations on pre-aggregate tables
+This functionality is not supported.
+
+  NOTE (<b>RESTRICTION</b>):
+  Adding a new column to the main table does not have any effect on pre-aggregate tables. However, if
+  dropping or renaming a column has an impact on a pre-aggregate table, such operations will be
+  rejected and an error will be thrown. All the pre-aggregate tables <b>will have to be dropped</b>
+  before alter operations can be performed on the main table. Pre-aggregate tables can be rebuilt
+  manually after alter table operations are completed.
+  
+### Supporting timeseries data (Alpha feature in 1.3.0)
+Carbondata has built-in understanding of time hierarchy and levels: year, month, day, hour, minute.
+Multiple pre-aggregate tables can be created for the hierarchy and Carbondata can do automatic 
+roll-up for the queries on these hierarchies.
+
+  ```
+  CREATE DATAMAP agg_year
+  ON TABLE sales
+  USING "timeseries"
+  DMPROPERTIES (
+  'event_time'='order_time',
+  'year_granualrity'='1',
+  ) AS
+  SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
+   avg(price) FROM sales GROUP BY order_time, country, sex
+    
+  CREATE DATAMAP agg_month
+  ON TABLE sales
+  USING "timeseries"
+  DMPROPERTIES (
+  'event_time'='order_time',
+  'month_granualrity'='1',
+  ) AS
+  SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
+   avg(price) FROM sales GROUP BY order_time, country, sex
+    
+  CREATE DATAMAP agg_day
+  ON TABLE sales
+  USING "timeseries"
+  DMPROPERTIES (
+  'event_time'='order_time',
+  'day_granualrity'='1',
+  ) AS
+  SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
+   avg(price) FROM sales GROUP BY order_time, country, sex
+        
+  CREATE DATAMAP agg_sales_hour
+  ON TABLE sales
+  USING "timeseries"
+  DMPROPERTIES (
+  'event_time'='order_time',
+  'hour_granualrity'='1',
+  ) AS
+  SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
+   avg(price) FROM sales GROUP BY order_time, country, sex
+  
+  CREATE DATAMAP agg_minute
+  ON TABLE sales
+  USING "timeseries"
+  DMPROPERTIES (
+  'event_time'='order_time',
+  'minute_granualrity'='1',
+  ) AS
+  SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
+   avg(price) FROM sales GROUP BY order_time, country, sex
+    
+  CREATE DATAMAP agg_minute
+  ON TABLE sales
+  USING "timeseries"
+  DMPROPERTIES (
+  'event_time'='order_time',
+  'minute_granualrity'='1',
+  ) AS
+  SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
+   avg(price) FROM sales GROUP BY order_time, country, sex
+  ```
+  
+  For querying data and automatically rolling up to the desired aggregation level, CarbonData supports
+  a UDF:
+  ```
+  timeseries(timeseries column name, 'aggregation level')
+  ```
+  ```
+  Select timeseries(order_time, 'hour'), sum(quantity) from sales group by timeseries(order_time,
+  'hour')
+  ```
+  
+  It is **not necessary** to create pre-aggregate tables for each granularity unless required for 
+  query. Carbondata can roll-up the data and fetch it.
+   
+  For example: for the main table **sales**, if timeseries pre-aggregate tables were created as
+  
+  ```
+  CREATE DATAMAP agg_day
+    ON TABLE sales
+    USING "timeseries"
+    DMPROPERTIES (
+    'event_time'='order_time',
+    'day_granualrity'='1',
+    ) AS
+    SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
+     avg(price) FROM sales GROUP BY order_time, country, sex
+          
+    CREATE DATAMAP agg_sales_hour
+    ON TABLE sales
+    USING "timeseries"
+    DMPROPERTIES (
+    'event_time'='order_time',
+    'hour_granualrity'='1',
+    ) AS
+    SELECT order_time, country, sex, sum(quantity), max(quantity), count(user_id), sum(price),
+     avg(price) FROM sales GROUP BY order_time, country, sex
+  ```
+  
+  Queries like below will be rolled-up and fetched from pre-aggregate tables
+  ```
+  Select timeseries(order_time, 'month'), sum(quantity) from sales group by timeseries(order_time,
+    'month')
+    
+  Select timeseries(order_time, 'year'), sum(quantity) from sales group by timeseries(order_time,
+    'year')
+  ```
+  
+  NOTE (<b>RESTRICTION</b>):
+  * Only a granularity value of 1 is supported for the time hierarchy levels. Other granularity
+  values are not supported
+  * Pre-aggregate tables for the desired levels need to be created one after the other
+  * Pre-aggregate tables created for each level need to be dropped separately
+    

http://git-wip-us.apache.org/repos/asf/carbondata/blob/ec893412/examples/spark2/src/main/scala/org/apache/carbondata/examples/PreAggregateTableExample.scala
----------------------------------------------------------------------
diff --git a/examples/spark2/src/main/scala/org/apache/carbondata/examples/PreAggregateTableExample.scala b/examples/spark2/src/main/scala/org/apache/carbondata/examples/PreAggregateTableExample.scala
index d27eefb..ace3dcc 100644
--- a/examples/spark2/src/main/scala/org/apache/carbondata/examples/PreAggregateTableExample.scala
+++ b/examples/spark2/src/main/scala/org/apache/carbondata/examples/PreAggregateTableExample.scala
@@ -38,6 +38,7 @@ object PreAggregateTableExample {
 
     // 1. simple usage for Pre-aggregate tables creation and query
     spark.sql("DROP TABLE IF EXISTS mainTable")
+    spark.sql("DROP TABLE IF EXISTS mainTable_other")
     spark.sql("""
                 | CREATE TABLE mainTable
                 | (id Int,
@@ -47,10 +48,23 @@ object PreAggregateTableExample {
                 | STORED BY 'org.apache.carbondata.format'
               """.stripMargin)
 
+    spark.sql("""
+                | CREATE TABLE mainTable_other
+                | (id Int,
+                | name String,
+                | city String,
+                | age Int)
+                | STORED BY 'org.apache.carbondata.format'
+              """.stripMargin)
+
     spark.sql(s"""
        LOAD DATA LOCAL INPATH '$testData' into table mainTable
        """)
 
+    spark.sql(s"""
+       LOAD DATA LOCAL INPATH '$testData' into table mainTable_other
+       """)
+
     spark.sql(
       s"""create datamap preagg_sum on table mainTable using 'preaggregate' as
          | select id,sum(age) from mainTable group by id"""
@@ -59,14 +73,17 @@ object PreAggregateTableExample {
       s"""create datamap preagg_avg on table mainTable using 'preaggregate' as
          | select id,avg(age) from mainTable group by id"""
         .stripMargin)
+
     spark.sql(
-      s"""create datamap preagg_count on table mainTable using 'preaggregate' as
+      s"""create datamap preagg_count_age on table mainTable using 'preaggregate' as
          | select id,count(age) from mainTable group by id"""
         .stripMargin)
+
     spark.sql(
       s"""create datamap preagg_min on table mainTable using 'preaggregate' as
          | select id,min(age) from mainTable group by id"""
         .stripMargin)
+
     spark.sql(
       s"""create datamap preagg_max on table mainTable using 'preaggregate' as
          | select id,max(age) from mainTable group by id"""
@@ -74,10 +91,40 @@ object PreAggregateTableExample {
 
     spark.sql(
       s"""
+         | create datamap preagg_case on table mainTable using 'preaggregate' as
+         | select name,sum(case when age=35 then id else 0 end) from mainTable group by name
+         | """.stripMargin)
+
+    spark.sql(
+      s"""create datamap preagg_count on table maintable using 'preaggregate' as
+         | select name, count(*) from maintable group by name""".stripMargin)
+
+    spark.sql(
+      s"""
          | SELECT id,max(age)
          | FROM mainTable group by id
       """.stripMargin).show()
 
+    spark.sql(
+      s"""
+         | select name, count(*) from
+         | mainTable group by name
+      """.stripMargin).show()
+
+    spark.sql(
+      s"""
+         | select name as NewName,
+         | sum(case when age=35 then id else 0 end) as sum
+         | from mainTable group by name order by name
+      """.stripMargin).show()
+
+    spark.sql(
+      s"""
+         | select t1.name,t1.city from mainTable_other t1 join
+         | (select name as newnewname,sum(age) as sum
+         | from mainTable group by name )t2 on t1.name=t2.newnewname
+      """.stripMargin).show()
+
     // 2.compare the performance : with pre-aggregate VS main table
 
     // build test data, if set the data is larger than 100M, it will take 10+ mins.
@@ -160,6 +207,7 @@ object PreAggregateTableExample {
     // scalastyle:on
 
     spark.sql("DROP TABLE IF EXISTS mainTable")
+    spark.sql("DROP TABLE IF EXISTS mainTable_other")
     spark.sql("DROP TABLE IF EXISTS personTable")
     spark.sql("DROP TABLE IF EXISTS personTableWithoutAgg")
 


[23/25] carbondata git commit: [CARBONDATA-2204] Optimized number of reads of tablestatus file while querying

Posted by ra...@apache.org.
[CARBONDATA-2204] Optimized number of reads of tablestatus file while querying

This PR avoids reading the table status file multiple times. For the first query it is read twice (needed for the datamap refresher), and only once from the second query onwards.

This closes #1999
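
As a rough illustration of the pattern this change enables, the sketch below reads the tablestatus file once and shares the resulting LoadMetadataDetails with both status managers. The class and method names are the ones visible in the diff that follows; how the identifier is obtained and what is done with the results are assumptions for illustration only.

```scala
// Sketch of the single-read pattern: read tablestatus once and reuse it. The CarbonData
// classes and methods are those visible in the diff below; obtaining `identifier` and the
// use of the results are illustrative assumptions.
import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier
import org.apache.carbondata.core.statusmanager.{SegmentStatusManager, SegmentUpdateStatusManager}
import org.apache.carbondata.core.util.path.CarbonTablePath

def listSegments(identifier: AbsoluteTableIdentifier): Unit = {
  // read the tablestatus file a single time ...
  val details = SegmentStatusManager.readTableStatusFile(
    CarbonTablePath.getTableStatusFilePath(identifier.getTablePath))
  // ... and hand the same details to both managers instead of letting each re-read the file
  val updateStatusManager = new SegmentUpdateStatusManager(identifier, details)
  val segments = new SegmentStatusManager(identifier).getValidAndInvalidSegments(details)
  println(s"valid segments: ${segments.getValidSegments}")
  println(s"update delta entries: ${updateStatusManager.getUpdateStatusDetails.length}")
}
```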


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/a816e0c0
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/a816e0c0
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/a816e0c0

Branch: refs/heads/branch-1.3
Commit: a816e0c0489892d10fb5c4d99fc5c1e60d9b8297
Parents: 5b0b503
Author: ravipesala <ra...@gmail.com>
Authored: Mon Feb 26 18:36:03 2018 +0530
Committer: ravipesala <ra...@gmail.com>
Committed: Sat Mar 3 18:06:07 2018 +0530

----------------------------------------------------------------------
 .../core/constants/CarbonCommonConstants.java   |   4 -
 .../statusmanager/SegmentStatusManager.java     | 122 +++++++++----------
 .../SegmentUpdateStatusManager.java             |  28 +++--
 .../core/util/path/CarbonTablePath.java         |   9 +-
 .../hadoop/api/CarbonTableInputFormat.java      |  24 ++--
 .../hadoop/test/util/StoreCreator.java          |   8 +-
 .../presto/util/CarbonDataStoreCreator.scala    |   2 +-
 .../carbondata/processing/StoreCreator.java     |   4 +-
 8 files changed, 108 insertions(+), 93 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/a816e0c0/core/src/main/java/org/apache/carbondata/core/constants/CarbonCommonConstants.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/carbondata/core/constants/CarbonCommonConstants.java b/core/src/main/java/org/apache/carbondata/core/constants/CarbonCommonConstants.java
index fa2b7d8..b2a3375 100644
--- a/core/src/main/java/org/apache/carbondata/core/constants/CarbonCommonConstants.java
+++ b/core/src/main/java/org/apache/carbondata/core/constants/CarbonCommonConstants.java
@@ -650,10 +650,6 @@ public final class CarbonCommonConstants {
    */
   public static final int DEFAULT_MAX_QUERY_EXECUTION_TIME = 60;
   /**
-   * LOADMETADATA_FILENAME
-   */
-  public static final String LOADMETADATA_FILENAME = "tablestatus";
-  /**
    * TABLE UPDATE STATUS FILENAME
    */
   public static final String TABLEUPDATESTATUS_FILENAME = "tableupdatestatus";

http://git-wip-us.apache.org/repos/asf/carbondata/blob/a816e0c0/core/src/main/java/org/apache/carbondata/core/statusmanager/SegmentStatusManager.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/carbondata/core/statusmanager/SegmentStatusManager.java b/core/src/main/java/org/apache/carbondata/core/statusmanager/SegmentStatusManager.java
index 76c2dc7..ab849ce 100755
--- a/core/src/main/java/org/apache/carbondata/core/statusmanager/SegmentStatusManager.java
+++ b/core/src/main/java/org/apache/carbondata/core/statusmanager/SegmentStatusManager.java
@@ -98,6 +98,15 @@ public class SegmentStatusManager {
    * @throws IOException
    */
   public ValidAndInvalidSegmentsInfo getValidAndInvalidSegments() throws IOException {
+    return getValidAndInvalidSegments(null);
+  }
+
+  /**
+   * get valid segment for given load status details.
+   *
+   */
+  public ValidAndInvalidSegmentsInfo getValidAndInvalidSegments(
+      LoadMetadataDetails[] loadMetadataDetails) throws IOException {
 
     // @TODO: move reading LoadStatus file to separate class
     List<Segment> listOfValidSegments = new ArrayList<>(10);
@@ -108,73 +117,56 @@ public class SegmentStatusManager {
     CarbonTablePath carbonTablePath = CarbonStorePath
         .getCarbonTablePath(absoluteTableIdentifier.getTablePath(),
             absoluteTableIdentifier.getCarbonTableIdentifier());
-    String dataPath = carbonTablePath.getTableStatusFilePath();
-    DataInputStream dataInputStream = null;
 
-    // Use GSON to deserialize the load information
-    Gson gson = new Gson();
-
-    AtomicFileOperations fileOperation =
-        new AtomicFileOperationsImpl(dataPath, FileFactory.getFileType(dataPath));
-    LoadMetadataDetails[] loadFolderDetailsArray;
     try {
-      if (FileFactory.isFileExist(dataPath, FileFactory.getFileType(dataPath))) {
-        dataInputStream = fileOperation.openForRead();
-        BufferedReader buffReader =
-            new BufferedReader(new InputStreamReader(dataInputStream, "UTF-8"));
-        loadFolderDetailsArray = gson.fromJson(buffReader, LoadMetadataDetails[].class);
-        // if loadFolderDetailsArray is null, assign a empty array
-        if (null == loadFolderDetailsArray) {
-          loadFolderDetailsArray = new LoadMetadataDetails[0];
-        }
-        //just directly iterate Array
-        for (LoadMetadataDetails segment : loadFolderDetailsArray) {
-          if (SegmentStatus.SUCCESS == segment.getSegmentStatus()
-              || SegmentStatus.MARKED_FOR_UPDATE == segment.getSegmentStatus()
-              || SegmentStatus.LOAD_PARTIAL_SUCCESS == segment.getSegmentStatus()
-              || SegmentStatus.STREAMING == segment.getSegmentStatus()
-              || SegmentStatus.STREAMING_FINISH == segment.getSegmentStatus()) {
-            // check for merged loads.
-            if (null != segment.getMergedLoadName()) {
-              Segment seg = new Segment(segment.getMergedLoadName(), segment.getSegmentFile());
-              if (!listOfValidSegments.contains(seg)) {
-                listOfValidSegments.add(seg);
-              }
-              // if merged load is updated then put it in updated list
-              if (SegmentStatus.MARKED_FOR_UPDATE == segment.getSegmentStatus()) {
-                listOfValidUpdatedSegments.add(seg);
-              }
-              continue;
+      if (loadMetadataDetails == null) {
+        loadMetadataDetails = readTableStatusFile(carbonTablePath.getTableStatusFilePath());
+      }
+      //just directly iterate Array
+      for (LoadMetadataDetails segment : loadMetadataDetails) {
+        if (SegmentStatus.SUCCESS == segment.getSegmentStatus()
+            || SegmentStatus.MARKED_FOR_UPDATE == segment.getSegmentStatus()
+            || SegmentStatus.LOAD_PARTIAL_SUCCESS == segment.getSegmentStatus()
+            || SegmentStatus.STREAMING == segment.getSegmentStatus()
+            || SegmentStatus.STREAMING_FINISH == segment.getSegmentStatus()) {
+          // check for merged loads.
+          if (null != segment.getMergedLoadName()) {
+            Segment seg = new Segment(segment.getMergedLoadName(), segment.getSegmentFile());
+            if (!listOfValidSegments.contains(seg)) {
+              listOfValidSegments.add(seg);
             }
-
+            // if merged load is updated then put it in updated list
             if (SegmentStatus.MARKED_FOR_UPDATE == segment.getSegmentStatus()) {
-
-              listOfValidUpdatedSegments
-                  .add(new Segment(segment.getLoadName(), segment.getSegmentFile()));
-            }
-            if (SegmentStatus.STREAMING == segment.getSegmentStatus()
-                || SegmentStatus.STREAMING_FINISH == segment.getSegmentStatus()) {
-              listOfStreamSegments
-                  .add(new Segment(segment.getLoadName(), segment.getSegmentFile()));
-              continue;
+              listOfValidUpdatedSegments.add(seg);
             }
-            listOfValidSegments.add(new Segment(segment.getLoadName(), segment.getSegmentFile()));
-          } else if ((SegmentStatus.LOAD_FAILURE == segment.getSegmentStatus()
-              || SegmentStatus.COMPACTED == segment.getSegmentStatus()
-              || SegmentStatus.MARKED_FOR_DELETE == segment.getSegmentStatus())) {
-            listOfInvalidSegments.add(new Segment(segment.getLoadName(), segment.getSegmentFile()));
-          } else if (SegmentStatus.INSERT_IN_PROGRESS == segment.getSegmentStatus() ||
-              SegmentStatus.INSERT_OVERWRITE_IN_PROGRESS == segment.getSegmentStatus()) {
-            listOfInProgressSegments
+            continue;
+          }
+
+          if (SegmentStatus.MARKED_FOR_UPDATE == segment.getSegmentStatus()) {
+
+            listOfValidUpdatedSegments
                 .add(new Segment(segment.getLoadName(), segment.getSegmentFile()));
           }
+          if (SegmentStatus.STREAMING == segment.getSegmentStatus()
+              || SegmentStatus.STREAMING_FINISH == segment.getSegmentStatus()) {
+            listOfStreamSegments
+                .add(new Segment(segment.getLoadName(), segment.getSegmentFile()));
+            continue;
+          }
+          listOfValidSegments.add(new Segment(segment.getLoadName(), segment.getSegmentFile()));
+        } else if ((SegmentStatus.LOAD_FAILURE == segment.getSegmentStatus()
+            || SegmentStatus.COMPACTED == segment.getSegmentStatus()
+            || SegmentStatus.MARKED_FOR_DELETE == segment.getSegmentStatus())) {
+          listOfInvalidSegments.add(new Segment(segment.getLoadName(), segment.getSegmentFile()));
+        } else if (SegmentStatus.INSERT_IN_PROGRESS == segment.getSegmentStatus() ||
+            SegmentStatus.INSERT_OVERWRITE_IN_PROGRESS == segment.getSegmentStatus()) {
+          listOfInProgressSegments
+              .add(new Segment(segment.getLoadName(), segment.getSegmentFile()));
         }
       }
     } catch (IOException e) {
       LOG.error(e);
       throw e;
-    } finally {
-      CarbonUtil.closeStreams(dataInputStream);
     }
     return new ValidAndInvalidSegmentsInfo(listOfValidSegments, listOfValidUpdatedSegments,
         listOfInvalidSegments, listOfStreamSegments, listOfInProgressSegments);
@@ -188,26 +180,32 @@ public class SegmentStatusManager {
    */
   public static LoadMetadataDetails[] readLoadMetadata(String metadataFolderPath) {
     String metadataFileName = metadataFolderPath + CarbonCommonConstants.FILE_SEPARATOR
-        + CarbonCommonConstants.LOADMETADATA_FILENAME;
-    return readTableStatusFile(metadataFileName);
+        + CarbonTablePath.TABLE_STATUS_FILE;
+    try {
+      return readTableStatusFile(metadataFileName);
+    } catch (IOException e) {
+      return new LoadMetadataDetails[0];
+    }
   }
 
   /**
    * Reads the table status file with the specified UUID if non empty.
    */
-  public static LoadMetadataDetails[] readLoadMetadata(String metaDataFolderPath, String uuid) {
+  public static LoadMetadataDetails[] readLoadMetadata(String metaDataFolderPath, String uuid)
+      throws IOException {
     String tableStatusFileName;
     if (uuid.isEmpty()) {
       tableStatusFileName = metaDataFolderPath + CarbonCommonConstants.FILE_SEPARATOR
-          + CarbonCommonConstants.LOADMETADATA_FILENAME;
+          + CarbonTablePath.TABLE_STATUS_FILE;
     } else {
       tableStatusFileName = metaDataFolderPath + CarbonCommonConstants.FILE_SEPARATOR
-          + CarbonCommonConstants.LOADMETADATA_FILENAME + CarbonCommonConstants.UNDERSCORE + uuid;
+          + CarbonTablePath.TABLE_STATUS_FILE + CarbonCommonConstants.UNDERSCORE + uuid;
     }
     return readTableStatusFile(tableStatusFileName);
   }
 
-  public static LoadMetadataDetails[] readTableStatusFile(String tableStatusPath) {
+  public static LoadMetadataDetails[] readTableStatusFile(String tableStatusPath)
+      throws IOException {
     Gson gsonObjectToRead = new Gson();
     DataInputStream dataInputStream = null;
     BufferedReader buffReader = null;
@@ -228,7 +226,7 @@ public class SegmentStatusManager {
           gsonObjectToRead.fromJson(buffReader, LoadMetadataDetails[].class);
     } catch (IOException e) {
       LOG.error(e, "Failed to read metadata of load");
-      return new LoadMetadataDetails[0];
+      throw e;
     } finally {
       closeStreams(buffReader, inStream, dataInputStream);
     }

http://git-wip-us.apache.org/repos/asf/carbondata/blob/a816e0c0/core/src/main/java/org/apache/carbondata/core/statusmanager/SegmentUpdateStatusManager.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/carbondata/core/statusmanager/SegmentUpdateStatusManager.java b/core/src/main/java/org/apache/carbondata/core/statusmanager/SegmentUpdateStatusManager.java
index 71b6ba8..3fc2813 100644
--- a/core/src/main/java/org/apache/carbondata/core/statusmanager/SegmentUpdateStatusManager.java
+++ b/core/src/main/java/org/apache/carbondata/core/statusmanager/SegmentUpdateStatusManager.java
@@ -73,6 +73,21 @@ public class SegmentUpdateStatusManager {
   private Map<String, SegmentUpdateDetails> blockAndDetailsMap;
   private boolean isPartitionTable;
 
+  public SegmentUpdateStatusManager(AbsoluteTableIdentifier absoluteTableIdentifier,
+      LoadMetadataDetails[] segmentDetails) {
+    this.absoluteTableIdentifier = absoluteTableIdentifier;
+    carbonTablePath = CarbonStorePath.getCarbonTablePath(absoluteTableIdentifier.getTablePath(),
+        absoluteTableIdentifier.getCarbonTableIdentifier());
+    // current it is used only for read function scenarios, as file update always requires to work
+    // on latest file status.
+    this.segmentDetails = segmentDetails;
+    if (segmentDetails.length > 0) {
+      isPartitionTable = segmentDetails[0].getSegmentFile() != null;
+    }
+    updateDetails = readLoadMetadata();
+    populateMap();
+  }
+
   /**
    * @param absoluteTableIdentifier
    */
@@ -80,11 +95,10 @@ public class SegmentUpdateStatusManager {
     this.absoluteTableIdentifier = absoluteTableIdentifier;
     carbonTablePath = CarbonStorePath.getCarbonTablePath(absoluteTableIdentifier.getTablePath(),
         absoluteTableIdentifier.getCarbonTableIdentifier());
-    SegmentStatusManager segmentStatusManager = new SegmentStatusManager(absoluteTableIdentifier);
     // current it is used only for read function scenarios, as file update always requires to work
     // on latest file status.
     segmentDetails =
-        segmentStatusManager.readLoadMetadata(carbonTablePath.getMetadataDirectoryPath());
+        SegmentStatusManager.readLoadMetadata(carbonTablePath.getMetadataDirectoryPath());
     if (segmentDetails.length > 0) {
       isPartitionTable = segmentDetails[0].getSegmentFile() != null;
     }
@@ -732,16 +746,10 @@ public class SegmentUpdateStatusManager {
    * @return updateStatusFileName
    */
   private String getUpdatedStatusIdentifier() {
-    SegmentStatusManager ssm = new SegmentStatusManager(absoluteTableIdentifier);
-    CarbonTablePath carbonTablePath = CarbonStorePath
-        .getCarbonTablePath(absoluteTableIdentifier.getTablePath(),
-            absoluteTableIdentifier.getCarbonTableIdentifier());
-    LoadMetadataDetails[] loadDetails =
-        ssm.readLoadMetadata(carbonTablePath.getMetadataDirectoryPath());
-    if (loadDetails.length == 0) {
+    if (segmentDetails.length == 0) {
       return null;
     }
-    return loadDetails[0].getUpdateStatusFileName();
+    return segmentDetails[0].getUpdateStatusFileName();
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/carbondata/blob/a816e0c0/core/src/main/java/org/apache/carbondata/core/util/path/CarbonTablePath.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/carbondata/core/util/path/CarbonTablePath.java b/core/src/main/java/org/apache/carbondata/core/util/path/CarbonTablePath.java
index b5fe5ea..cb264c4 100644
--- a/core/src/main/java/org/apache/carbondata/core/util/path/CarbonTablePath.java
+++ b/core/src/main/java/org/apache/carbondata/core/util/path/CarbonTablePath.java
@@ -38,7 +38,7 @@ public class CarbonTablePath extends Path {
   private static final String DICTIONARY_META_EXT = ".dictmeta";
   private static final String SORT_INDEX_EXT = ".sortindex";
   private static final String SCHEMA_FILE = "schema";
-  private static final String TABLE_STATUS_FILE = "tablestatus";
+  public static final String TABLE_STATUS_FILE = "tablestatus";
   private static final String FACT_DIR = "Fact";
   private static final String SEGMENT_PREFIX = "Segment_";
   private static final String PARTITION_PREFIX = "Part";
@@ -177,6 +177,13 @@ public class CarbonTablePath extends Path {
   }
 
   /**
+   * Return table status file path based on `tablePath`
+   */
+  public static String getTableStatusFilePath(String tablePath) {
+    return getMetadataPath(tablePath) + CarbonCommonConstants.FILE_SEPARATOR + TABLE_STATUS_FILE;
+  }
+
+  /**
    * @param columnId unique column identifier
    * @return absolute path of dictionary meta file
    */

http://git-wip-us.apache.org/repos/asf/carbondata/blob/a816e0c0/hadoop/src/main/java/org/apache/carbondata/hadoop/api/CarbonTableInputFormat.java
----------------------------------------------------------------------
diff --git a/hadoop/src/main/java/org/apache/carbondata/hadoop/api/CarbonTableInputFormat.java b/hadoop/src/main/java/org/apache/carbondata/hadoop/api/CarbonTableInputFormat.java
index 3dbf04f..f6624cd 100644
--- a/hadoop/src/main/java/org/apache/carbondata/hadoop/api/CarbonTableInputFormat.java
+++ b/hadoop/src/main/java/org/apache/carbondata/hadoop/api/CarbonTableInputFormat.java
@@ -64,6 +64,7 @@ import org.apache.carbondata.core.stats.QueryStatistic;
 import org.apache.carbondata.core.stats.QueryStatisticsConstants;
 import org.apache.carbondata.core.stats.QueryStatisticsRecorder;
 import org.apache.carbondata.core.statusmanager.FileFormat;
+import org.apache.carbondata.core.statusmanager.LoadMetadataDetails;
 import org.apache.carbondata.core.statusmanager.SegmentStatusManager;
 import org.apache.carbondata.core.statusmanager.SegmentUpdateStatusManager;
 import org.apache.carbondata.core.util.CarbonProperties;
@@ -341,7 +342,10 @@ public class CarbonTableInputFormat<T> extends FileInputFormat<Void, T> {
    */
   @Override public List<InputSplit> getSplits(JobContext job) throws IOException {
     AbsoluteTableIdentifier identifier = getAbsoluteTableIdentifier(job.getConfiguration());
-    SegmentUpdateStatusManager updateStatusManager = new SegmentUpdateStatusManager(identifier);
+    LoadMetadataDetails[] loadMetadataDetails = SegmentStatusManager
+        .readTableStatusFile(CarbonTablePath.getTableStatusFilePath(identifier.getTablePath()));
+    SegmentUpdateStatusManager updateStatusManager =
+        new SegmentUpdateStatusManager(identifier, loadMetadataDetails);
     CarbonTable carbonTable = getOrCreateCarbonTable(job.getConfiguration());
     if (null == carbonTable) {
       throw new IOException("Missing/Corrupt schema file for table.");
@@ -355,7 +359,7 @@ public class CarbonTableInputFormat<T> extends FileInputFormat<Void, T> {
     // get all valid segments and set them into the configuration
     SegmentStatusManager segmentStatusManager = new SegmentStatusManager(identifier);
     SegmentStatusManager.ValidAndInvalidSegmentsInfo segments =
-        segmentStatusManager.getValidAndInvalidSegments();
+        segmentStatusManager.getValidAndInvalidSegments(loadMetadataDetails);
 
     if (getValidateSegmentsToAccess(job.getConfiguration())) {
       List<Segment> validSegments = segments.getValidSegments();
@@ -435,7 +439,7 @@ public class CarbonTableInputFormat<T> extends FileInputFormat<Void, T> {
     // do block filtering and get split
     List<InputSplit> splits =
         getSplits(job, filterInterface, filteredSegmentToAccess, matchedPartitions, partitionInfo,
-            null);
+            null, updateStatusManager);
     // pass the invalid segment to task side in order to remove index entry in task side
     if (invalidSegments.size() > 0) {
       for (InputSplit split : splits) {
@@ -616,7 +620,7 @@ public class CarbonTableInputFormat<T> extends FileInputFormat<Void, T> {
           CarbonInputFormatUtil.resolveFilter(filter, identifier, tableProvider);
       // do block filtering and get split
       List<InputSplit> splits = getSplits(job, filterInterface, segmentList, matchedPartitions,
-          partitionInfo, oldPartitionIdList);
+          partitionInfo, oldPartitionIdList, new SegmentUpdateStatusManager(identifier));
       // pass the invalid segment to task side in order to remove index entry in task side
       if (invalidSegments.size() > 0) {
         for (InputSplit split : splits) {
@@ -667,7 +671,8 @@ public class CarbonTableInputFormat<T> extends FileInputFormat<Void, T> {
    */
   private List<InputSplit> getSplits(JobContext job, FilterResolverIntf filterResolver,
       List<Segment> validSegments, BitSet matchedPartitions, PartitionInfo partitionInfo,
-      List<Integer> oldPartitionIdList) throws IOException {
+      List<Integer> oldPartitionIdList, SegmentUpdateStatusManager updateStatusManager)
+      throws IOException {
 
     List<InputSplit> result = new LinkedList<InputSplit>();
     UpdateVO invalidBlockVOForSegmentId = null;
@@ -675,8 +680,6 @@ public class CarbonTableInputFormat<T> extends FileInputFormat<Void, T> {
 
     AbsoluteTableIdentifier absoluteTableIdentifier =
         getOrCreateCarbonTable(job.getConfiguration()).getAbsoluteTableIdentifier();
-    SegmentUpdateStatusManager updateStatusManager =
-        new SegmentUpdateStatusManager(absoluteTableIdentifier);
 
     isIUDTable = (updateStatusManager.getUpdateStatusDetails().length != 0);
 
@@ -930,9 +933,12 @@ public class CarbonTableInputFormat<T> extends FileInputFormat<Void, T> {
       List<PartitionSpec> partitions) throws IOException {
     TableDataMap blockletMap = DataMapStoreManager.getInstance()
         .getDataMap(identifier, BlockletDataMap.NAME, BlockletDataMapFactory.class.getName());
-    SegmentUpdateStatusManager updateStatusManager = new SegmentUpdateStatusManager(identifier);
+    LoadMetadataDetails[] loadMetadataDetails = SegmentStatusManager
+        .readTableStatusFile(CarbonTablePath.getTableStatusFilePath(identifier.getTablePath()));
+    SegmentUpdateStatusManager updateStatusManager =
+        new SegmentUpdateStatusManager(identifier, loadMetadataDetails);
     SegmentStatusManager.ValidAndInvalidSegmentsInfo allSegments =
-        new SegmentStatusManager(identifier).getValidAndInvalidSegments();
+        new SegmentStatusManager(identifier).getValidAndInvalidSegments(loadMetadataDetails);
     Map<String, Long> blockRowCountMapping = new HashMap<>();
     Map<String, Long> segmentAndBlockCountMapping = new HashMap<>();
 

http://git-wip-us.apache.org/repos/asf/carbondata/blob/a816e0c0/hadoop/src/test/java/org/apache/carbondata/hadoop/test/util/StoreCreator.java
----------------------------------------------------------------------
diff --git a/hadoop/src/test/java/org/apache/carbondata/hadoop/test/util/StoreCreator.java b/hadoop/src/test/java/org/apache/carbondata/hadoop/test/util/StoreCreator.java
index fbf33d6..7e58f97 100644
--- a/hadoop/src/test/java/org/apache/carbondata/hadoop/test/util/StoreCreator.java
+++ b/hadoop/src/test/java/org/apache/carbondata/hadoop/test/util/StoreCreator.java
@@ -74,15 +74,15 @@ import org.apache.carbondata.core.writer.sortindex.CarbonDictionarySortIndexWrit
 import org.apache.carbondata.core.writer.sortindex.CarbonDictionarySortIndexWriterImpl;
 import org.apache.carbondata.core.writer.sortindex.CarbonDictionarySortInfo;
 import org.apache.carbondata.core.writer.sortindex.CarbonDictionarySortInfoPreparator;
-import org.apache.carbondata.processing.util.TableOptionConstant;
+import org.apache.carbondata.processing.loading.DataLoadExecutor;
+import org.apache.carbondata.processing.loading.constants.DataLoadProcessorConstants;
 import org.apache.carbondata.processing.loading.csvinput.BlockDetails;
 import org.apache.carbondata.processing.loading.csvinput.CSVInputFormat;
 import org.apache.carbondata.processing.loading.csvinput.CSVRecordReaderIterator;
 import org.apache.carbondata.processing.loading.csvinput.StringArrayWritable;
 import org.apache.carbondata.processing.loading.model.CarbonDataLoadSchema;
 import org.apache.carbondata.processing.loading.model.CarbonLoadModel;
-import org.apache.carbondata.processing.loading.DataLoadExecutor;
-import org.apache.carbondata.processing.loading.constants.DataLoadProcessorConstants;
+import org.apache.carbondata.processing.util.TableOptionConstant;
 
 import com.google.gson.Gson;
 import org.apache.hadoop.conf.Configuration;
@@ -471,7 +471,7 @@ public class StoreCreator {
     listOfLoadFolderDetails.add(loadMetadataDetails);
 
     String dataLoadLocation = schema.getCarbonTable().getMetaDataFilepath() + File.separator
-        + CarbonCommonConstants.LOADMETADATA_FILENAME;
+        + CarbonTablePath.TABLE_STATUS_FILE;
 
     DataOutputStream dataOutputStream;
     Gson gsonObjectToWrite = new Gson();

http://git-wip-us.apache.org/repos/asf/carbondata/blob/a816e0c0/integration/presto/src/test/scala/org/apache/carbondata/presto/util/CarbonDataStoreCreator.scala
----------------------------------------------------------------------
diff --git a/integration/presto/src/test/scala/org/apache/carbondata/presto/util/CarbonDataStoreCreator.scala b/integration/presto/src/test/scala/org/apache/carbondata/presto/util/CarbonDataStoreCreator.scala
index 7b5c311..9d82d42 100644
--- a/integration/presto/src/test/scala/org/apache/carbondata/presto/util/CarbonDataStoreCreator.scala
+++ b/integration/presto/src/test/scala/org/apache/carbondata/presto/util/CarbonDataStoreCreator.scala
@@ -535,7 +535,7 @@ object CarbonDataStoreCreator {
         loadMetadataDetails.getTimeStamp(readCurrentTime()))
       listOfLoadFolderDetails.add(loadMetadataDetails)
       val dataLoadLocation: String = schema.getCarbonTable.getMetaDataFilepath + File.separator +
-                                     CarbonCommonConstants.LOADMETADATA_FILENAME
+                                     CarbonTablePath.TABLE_STATUS_FILE
       val gsonObjectToWrite: Gson = new Gson()
       val writeOperation: AtomicFileOperations = new AtomicFileOperationsImpl(
         dataLoadLocation,

http://git-wip-us.apache.org/repos/asf/carbondata/blob/a816e0c0/processing/src/test/java/org/apache/carbondata/processing/StoreCreator.java
----------------------------------------------------------------------
diff --git a/processing/src/test/java/org/apache/carbondata/processing/StoreCreator.java b/processing/src/test/java/org/apache/carbondata/processing/StoreCreator.java
index e662757..e93227d 100644
--- a/processing/src/test/java/org/apache/carbondata/processing/StoreCreator.java
+++ b/processing/src/test/java/org/apache/carbondata/processing/StoreCreator.java
@@ -43,6 +43,7 @@ import org.apache.carbondata.core.datastore.impl.FileFactory;
 import org.apache.carbondata.core.fileoperations.AtomicFileOperations;
 import org.apache.carbondata.core.fileoperations.AtomicFileOperationsImpl;
 import org.apache.carbondata.core.fileoperations.FileWriteOperation;
+import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier;
 import org.apache.carbondata.core.metadata.CarbonMetadata;
 import org.apache.carbondata.core.metadata.CarbonTableIdentifier;
 import org.apache.carbondata.core.metadata.ColumnIdentifier;
@@ -52,7 +53,6 @@ import org.apache.carbondata.core.metadata.datatype.DataTypes;
 import org.apache.carbondata.core.metadata.encoder.Encoding;
 import org.apache.carbondata.core.metadata.schema.SchemaEvolution;
 import org.apache.carbondata.core.metadata.schema.SchemaEvolutionEntry;
-import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier;
 import org.apache.carbondata.core.metadata.schema.table.CarbonTable;
 import org.apache.carbondata.core.metadata.schema.table.TableInfo;
 import org.apache.carbondata.core.metadata.schema.table.TableSchema;
@@ -446,7 +446,7 @@ public class StoreCreator {
     listOfLoadFolderDetails.add(loadMetadataDetails);
 
     String dataLoadLocation = schema.getCarbonTable().getMetaDataFilepath() + File.separator
-        + CarbonCommonConstants.LOADMETADATA_FILENAME;
+        + CarbonTablePath.TABLE_STATUS_FILE;
 
     DataOutputStream dataOutputStream;
     Gson gsonObjectToWrite = new Gson();


[19/25] carbondata git commit: [CARBONDATA-2211] in case of DDL HandOff should not be execute in thread

Posted by ra...@apache.org.
[CARBONDATA-2211] in case of DDL HandOff should not be execute in thread

1. DDL handoff will be executed in the blocking thread.
2. Auto handoff will be executed in a new non-blocking thread.

This closes #2008
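
In effect, the change dispatches the handoff as in the simplified sketch below; the real implementation lives in StreamHandoffRDD, shown in the diff that follows, and `doHandoff` stands in for iterateStreamingHandoff.

```scala
// Simplified sketch of the new dispatch: DDL-triggered handoff blocks the caller, while
// auto handoff after streaming ingestion runs in its own thread. `doHandoff` is a stand-in
// for StreamHandoffRDD.iterateStreamingHandoff in the real code shown below.
def startHandoff(isDDL: Boolean)(doHandoff: () => Unit): Unit = {
  if (isDDL) {
    // ALTER TABLE ... COMPACT 'streaming' waits for the handoff to finish
    doHandoff()
  } else {
    // automatic handoff must not block the streaming sink, so run it asynchronously
    val handoffThread = new Thread() {
      override def run(): Unit = doHandoff()
    }
    handoffThread.start()
  }
}
```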


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/6d3105bb
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/6d3105bb
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/6d3105bb

Branch: refs/heads/branch-1.3
Commit: 6d3105bbc743ad868a54db5c8d2e7fbafbddaf02
Parents: 6bb5a2b
Author: rahulforallp <ra...@knoldus.in>
Authored: Tue Feb 27 21:50:20 2018 +0530
Committer: ravipesala <ra...@gmail.com>
Committed: Sat Mar 3 18:04:48 2018 +0530

----------------------------------------------------------------------
 .../CarbonAlterTableCompactionCommand.scala        |  2 +-
 .../carbondata/streaming/StreamHandoffRDD.scala    | 17 +++++++++++------
 .../streaming/CarbonAppendableStreamSink.scala     |  3 ++-
 3 files changed, 14 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/6d3105bb/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonAlterTableCompactionCommand.scala
----------------------------------------------------------------------
diff --git a/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonAlterTableCompactionCommand.scala b/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonAlterTableCompactionCommand.scala
index f6019e4..9b9ca0e 100644
--- a/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonAlterTableCompactionCommand.scala
+++ b/integration/spark2/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonAlterTableCompactionCommand.scala
@@ -181,7 +181,7 @@ case class CarbonAlterTableCompactionCommand(
     if (compactionType == CompactionType.STREAMING) {
       StreamHandoffRDD.startStreamingHandoffThread(
         carbonLoadModel,
-        sqlContext.sparkSession)
+        sqlContext.sparkSession, true)
       return
     }
 

http://git-wip-us.apache.org/repos/asf/carbondata/blob/6d3105bb/streaming/src/main/scala/org/apache/carbondata/streaming/StreamHandoffRDD.scala
----------------------------------------------------------------------
diff --git a/streaming/src/main/scala/org/apache/carbondata/streaming/StreamHandoffRDD.scala b/streaming/src/main/scala/org/apache/carbondata/streaming/StreamHandoffRDD.scala
index b03ee1e..a46ced5 100644
--- a/streaming/src/main/scala/org/apache/carbondata/streaming/StreamHandoffRDD.scala
+++ b/streaming/src/main/scala/org/apache/carbondata/streaming/StreamHandoffRDD.scala
@@ -279,15 +279,20 @@ object StreamHandoffRDD {
    */
   def startStreamingHandoffThread(
       carbonLoadModel: CarbonLoadModel,
-      sparkSession: SparkSession
+      sparkSession: SparkSession,
+      isDDL: Boolean
   ): Unit = {
-    // start a new thread to execute streaming segment handoff
-    val handoffThread = new Thread() {
-      override def run(): Unit = {
-        iterateStreamingHandoff(carbonLoadModel, sparkSession)
+    if (isDDL) {
+      iterateStreamingHandoff(carbonLoadModel, sparkSession)
+    } else {
+      // start a new thread to execute streaming segment handoff
+      val handoffThread = new Thread() {
+        override def run(): Unit = {
+          iterateStreamingHandoff(carbonLoadModel, sparkSession)
+        }
       }
+      handoffThread.start()
     }
-    handoffThread.start()
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/carbondata/blob/6d3105bb/streaming/src/main/scala/org/apache/spark/sql/execution/streaming/CarbonAppendableStreamSink.scala
----------------------------------------------------------------------
diff --git a/streaming/src/main/scala/org/apache/spark/sql/execution/streaming/CarbonAppendableStreamSink.scala b/streaming/src/main/scala/org/apache/spark/sql/execution/streaming/CarbonAppendableStreamSink.scala
index f2f9853..312d24e 100644
--- a/streaming/src/main/scala/org/apache/spark/sql/execution/streaming/CarbonAppendableStreamSink.scala
+++ b/streaming/src/main/scala/org/apache/spark/sql/execution/streaming/CarbonAppendableStreamSink.scala
@@ -178,7 +178,8 @@ class CarbonAppendableStreamSink(
       if (enableAutoHandoff) {
         StreamHandoffRDD.startStreamingHandoffThread(
           carbonLoadModel,
-          sparkSession)
+          sparkSession,
+          false)
       }
     }
   }


[15/25] carbondata git commit: [CARBONDATA-2147][CARBONDATA-2148][Streaming] Add new row parser: RowStreamParserImpl

Posted by ra...@apache.org.
[CARBONDATA-2147][CARBONDATA-2148][Streaming] Add new row parser: RowStreamParserImpl

Currently the default value of 'carbon.stream.parser' is CSVStreamParserImp, which transforms InternalRow(0) into Array[Object]; InternalRow(0) holds the value of a single line received from a socket. When data is received from Kafka, the schema of the InternalRow changes, so either the fields of the Kafka data Row have to be assembled into a String and stored as InternalRow(0), or a new parser has to be defined to convert the Kafka data Row into Array[Object]. This work has to be repeated for every table.

Solution:
Use a new parser called RowStreamParserImpl. This parser automatically converts the InternalRow to Array[Object] according to the table schema. In general, the source data is transformed into a structured Row object, so there is no need to define a dedicated parser for every table (see the usage sketch below).

This closes #1959
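
As a quick orientation before the diff, a minimal usage sketch. Assumptions: a running CarbonSession named spark, an existing streaming table default.sensor_table, and Spark's built-in "rate" source standing in for a real Kafka/socket source; only the option key CarbonStreamParser.CARBON_STREAM_PARSER and the parser class name are taken from the code added below.

import org.apache.spark.sql.SparkSession
import org.apache.carbondata.streaming.parser.CarbonStreamParser

// Illustrative event type; with RowStreamParserImp the fields are mapped to the
// table schema directly, without per-table parsers or CSV assembly.
case class SensorEvent(id: Int, name: String, value: Double)

def startRowParserIngest(spark: SparkSession, checkpointDir: String): Unit = {
  import spark.implicits._
  val events = spark.readStream
    .format("rate")                     // placeholder source for this sketch
    .load()
    .map(r => SensorEvent(r.getLong(1).toInt, s"name_${r.getLong(1)}", r.getLong(1) * 1.0))

  events.writeStream
    .format("carbondata")
    .option("checkpointLocation", checkpointDir)
    .option("dbName", "default")
    .option("tableName", "sensor_table")
    // choose the schema-driven row parser instead of the default CSVStreamParserImp
    .option(CarbonStreamParser.CARBON_STREAM_PARSER,
      "org.apache.carbondata.streaming.parser.RowStreamParserImp")
    .start()
}

The CarbonStructuredStreamingWithRowParser example and TestStreamingTableWithRowParser added below do the same thing against a socket source, including the struct column case.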


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/566217c7
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/566217c7
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/566217c7

Branch: refs/heads/branch-1.3
Commit: 566217c7714c649475211cc3f6eafb8206bc446f
Parents: 55fe349
Author: Zhang Zhichao <44...@qq.com>
Authored: Fri Feb 9 14:49:58 2018 +0800
Committer: ravipesala <ra...@gmail.com>
Committed: Sat Mar 3 18:03:04 2018 +0530

----------------------------------------------------------------------
 .../CarbonStructuredStreamingExample.scala      |  20 +-
 ...CarbonStructuredStreamingWithRowParser.scala | 217 +++++
 .../TestStreamingTableOperation.scala           |  23 +-
 .../TestStreamingTableWithRowParser.scala       | 944 +++++++++++++++++++
 .../streaming/parser/CSVStreamParserImp.java    |   3 +-
 .../streaming/parser/CarbonStreamParser.java    |   6 +-
 .../streaming/parser/RowStreamParserImp.scala   |  72 ++
 .../streaming/CarbonAppendableStreamSink.scala  |  25 +-
 8 files changed, 1272 insertions(+), 38 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/566217c7/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonStructuredStreamingExample.scala
----------------------------------------------------------------------
diff --git a/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonStructuredStreamingExample.scala b/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonStructuredStreamingExample.scala
index 247a59b..8ce4afc 100644
--- a/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonStructuredStreamingExample.scala
+++ b/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonStructuredStreamingExample.scala
@@ -23,8 +23,6 @@ import java.net.ServerSocket
 import org.apache.spark.sql.{CarbonEnv, SparkSession}
 import org.apache.spark.sql.streaming.{ProcessingTime, StreamingQuery}
 
-import org.apache.carbondata.core.constants.CarbonCommonConstants
-import org.apache.carbondata.core.util.CarbonProperties
 import org.apache.carbondata.core.util.path.{CarbonStorePath, CarbonTablePath}
 
 // scalastyle:off println
@@ -34,23 +32,9 @@ object CarbonStructuredStreamingExample {
     // setup paths
     val rootPath = new File(this.getClass.getResource("/").getPath
                             + "../../../..").getCanonicalPath
-    val storeLocation = s"$rootPath/examples/spark2/target/store"
-    val warehouse = s"$rootPath/examples/spark2/target/warehouse"
-    val metastoredb = s"$rootPath/examples/spark2/target"
-    val streamTableName = s"stream_table"
-
-    CarbonProperties.getInstance()
-      .addProperty(CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT, "yyyy/MM/dd")
 
-    import org.apache.spark.sql.CarbonSession._
-    val spark = SparkSession
-      .builder()
-      .master("local")
-      .appName("CarbonStructuredStreamingExample")
-      .config("spark.sql.warehouse.dir", warehouse)
-      .getOrCreateCarbonSession(storeLocation, metastoredb)
-
-    spark.sparkContext.setLogLevel("ERROR")
+    val spark = ExampleUtils.createCarbonSession("CarbonStructuredStreamingExample", 4)
+    val streamTableName = s"stream_table"
 
     val requireCreateTable = true
     val useComplexDataType = false

http://git-wip-us.apache.org/repos/asf/carbondata/blob/566217c7/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonStructuredStreamingWithRowParser.scala
----------------------------------------------------------------------
diff --git a/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonStructuredStreamingWithRowParser.scala b/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonStructuredStreamingWithRowParser.scala
new file mode 100644
index 0000000..f134a8d
--- /dev/null
+++ b/examples/spark2/src/main/scala/org/apache/carbondata/examples/CarbonStructuredStreamingWithRowParser.scala
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.examples
+
+import java.io.{File, PrintWriter}
+import java.net.ServerSocket
+
+import org.apache.spark.sql.{CarbonEnv, SparkSession}
+import org.apache.spark.sql.streaming.{ProcessingTime, StreamingQuery}
+
+import org.apache.carbondata.core.util.path.{CarbonStorePath, CarbonTablePath}
+import org.apache.carbondata.streaming.parser.CarbonStreamParser
+
+case class FileElement(school: Array[String], age: Int)
+case class StreamData(id: Int, name: String, city: String, salary: Float, file: FileElement)
+
+// scalastyle:off println
+object CarbonStructuredStreamingWithRowParser {
+  def main(args: Array[String]) {
+
+    // setup paths
+    val rootPath = new File(this.getClass.getResource("/").getPath
+                            + "../../../..").getCanonicalPath
+
+    val spark = ExampleUtils.createCarbonSession("CarbonStructuredStreamingWithRowParser", 4)
+    val streamTableName = s"stream_table_with_row_parser"
+
+    val requireCreateTable = true
+    val useComplexDataType = false
+
+    if (requireCreateTable) {
+      // drop table if exists previously
+      spark.sql(s"DROP TABLE IF EXISTS ${ streamTableName }")
+      // Create target carbon table and populate with initial data
+      if (useComplexDataType) {
+        spark.sql(
+          s"""
+             | CREATE TABLE ${ streamTableName }(
+             | id INT,
+             | name STRING,
+             | city STRING,
+             | salary FLOAT,
+             | file struct<school:array<string>, age:int>
+             | )
+             | STORED BY 'carbondata'
+             | TBLPROPERTIES(
+             | 'streaming'='true', 'sort_columns'='name', 'dictionary_include'='city')
+             | """.stripMargin)
+      } else {
+        spark.sql(
+          s"""
+             | CREATE TABLE ${ streamTableName }(
+             | id INT,
+             | name STRING,
+             | city STRING,
+             | salary FLOAT
+             | )
+             | STORED BY 'carbondata'
+             | TBLPROPERTIES(
+             | 'streaming'='true', 'sort_columns'='name')
+             | """.stripMargin)
+      }
+
+      val carbonTable = CarbonEnv.getCarbonTable(Some("default"), streamTableName)(spark)
+      val tablePath = CarbonStorePath.getCarbonTablePath(carbonTable.getAbsoluteTableIdentifier)
+      // batch load
+      val path = s"$rootPath/examples/spark2/src/main/resources/streamSample.csv"
+      spark.sql(
+        s"""
+           | LOAD DATA LOCAL INPATH '$path'
+           | INTO TABLE $streamTableName
+           | OPTIONS('HEADER'='true')
+         """.stripMargin)
+
+      // streaming ingest
+      val serverSocket = new ServerSocket(7071)
+      val thread1 = startStreaming(spark, tablePath)
+      val thread2 = writeSocket(serverSocket)
+      val thread3 = showTableCount(spark, streamTableName)
+
+      System.out.println("type enter to interrupt streaming")
+      System.in.read()
+      thread1.interrupt()
+      thread2.interrupt()
+      thread3.interrupt()
+      serverSocket.close()
+    }
+
+    spark.sql(s"select count(*) from ${ streamTableName }").show(100, truncate = false)
+
+    spark.sql(s"select * from ${ streamTableName }").show(100, truncate = false)
+
+    // record(id = 100000001) comes from batch segment_0
+    // record(id = 1) comes from stream segment_1
+    spark.sql(s"select * " +
+              s"from ${ streamTableName } " +
+              s"where id = 100000001 or id = 1 limit 100").show(100, truncate = false)
+
+    // not filter
+    spark.sql(s"select * " +
+              s"from ${ streamTableName } " +
+              s"where id < 10 limit 100").show(100, truncate = false)
+
+    if (useComplexDataType) {
+      // complex
+      spark.sql(s"select file.age, file.school " +
+                s"from ${ streamTableName } " +
+                s"where file.age = 30 ").show(100, truncate = false)
+    }
+
+    spark.stop()
+    System.out.println("streaming finished")
+  }
+
+  def showTableCount(spark: SparkSession, tableName: String): Thread = {
+    val thread = new Thread() {
+      override def run(): Unit = {
+        for (_ <- 0 to 1000) {
+          spark.sql(s"select count(*) from $tableName").show(truncate = false)
+          Thread.sleep(1000 * 3)
+        }
+      }
+    }
+    thread.start()
+    thread
+  }
+
+  def startStreaming(spark: SparkSession, tablePath: CarbonTablePath): Thread = {
+    val thread = new Thread() {
+      override def run(): Unit = {
+        var qry: StreamingQuery = null
+        try {
+          import spark.implicits._
+          val readSocketDF = spark.readStream
+            .format("socket")
+            .option("host", "localhost")
+            .option("port", 7071)
+            .load()
+            .as[String]
+            .map(_.split(","))
+            .map { fields => {
+              val tmp = fields(4).split("\\$")
+              val file = FileElement(tmp(0).split(":"), tmp(1).toInt)
+              if (fields(0).toInt % 2 == 0) {
+                StreamData(fields(0).toInt, null, fields(2), fields(3).toFloat, file)
+              } else {
+                StreamData(fields(0).toInt, fields(1), fields(2), fields(3).toFloat, file)
+              }
+            } }
+
+          // Write data from socket stream to carbondata file
+          qry = readSocketDF.writeStream
+            .format("carbondata")
+            .trigger(ProcessingTime("5 seconds"))
+            .option("checkpointLocation", tablePath.getStreamingCheckpointDir)
+            .option("dbName", "default")
+            .option("tableName", "stream_table_with_row_parser")
+            .option(CarbonStreamParser.CARBON_STREAM_PARSER,
+              "org.apache.carbondata.streaming.parser.RowStreamParserImp")
+            .start()
+
+          qry.awaitTermination()
+        } catch {
+          case ex: Exception =>
+            ex.printStackTrace()
+            println("Done reading and writing streaming data")
+        } finally {
+          qry.stop()
+        }
+      }
+    }
+    thread.start()
+    thread
+  }
+
+  def writeSocket(serverSocket: ServerSocket): Thread = {
+    val thread = new Thread() {
+      override def run(): Unit = {
+        // wait for client to connection request and accept
+        val clientSocket = serverSocket.accept()
+        val socketWriter = new PrintWriter(clientSocket.getOutputStream())
+        var index = 0
+        for (_ <- 1 to 1000) {
+          // write 5 records per iteration
+          for (_ <- 0 to 1000) {
+            index = index + 1
+            socketWriter.println(index.toString + ",name_" + index
+                                 + ",city_" + index + "," + (index * 10000.00).toString +
+                                 ",school_" + index + ":school_" + index + index + "$" + index)
+          }
+          socketWriter.flush()
+          Thread.sleep(1000)
+        }
+        socketWriter.close()
+        System.out.println("Socket closed")
+      }
+    }
+    thread.start()
+    thread
+  }
+}
+// scalastyle:on println

http://git-wip-us.apache.org/repos/asf/carbondata/blob/566217c7/integration/spark2/src/test/scala/org/apache/spark/carbondata/TestStreamingTableOperation.scala
----------------------------------------------------------------------
diff --git a/integration/spark2/src/test/scala/org/apache/spark/carbondata/TestStreamingTableOperation.scala b/integration/spark2/src/test/scala/org/apache/spark/carbondata/TestStreamingTableOperation.scala
index 881af3a..a7dfabd 100644
--- a/integration/spark2/src/test/scala/org/apache/spark/carbondata/TestStreamingTableOperation.scala
+++ b/integration/spark2/src/test/scala/org/apache/spark/carbondata/TestStreamingTableOperation.scala
@@ -1149,7 +1149,6 @@ class TestStreamingTableOperation extends QueryTest with BeforeAndAfterAll {
         val clientSocket = serverSocket.accept()
         val socketWriter = new PrintWriter(clientSocket.getOutputStream())
         var index = 0
-        var timeRow = true
         for (_ <- 1 to writeNums) {
           // write 5 records per iteration
           val stringBuilder = new StringBuilder()
@@ -1165,20 +1164,16 @@ class TestStreamingTableOperation extends QueryTest with BeforeAndAfterAll {
                                      + ",city_" + index + "," + (10000.00 * index).toString + ",0.01,80.01" +
                                      ",1990-01-01,2010-01-01 10:01:01,2010-01-01 10:01:01" +
                                      ",school_" + index + ":school_" + index + index + "$" + index)
+              } else if (index == 9) {
+                stringBuilder.append(index.toString + ",name_" + index
+                                     + ",city_" + index + "," + (10000.00 * index).toString + ",0.04,80.04" +
+                                     ",1990-01-04,2010-01-04 10:01:01,2010-01-04 10:01:01" +
+                                     ",school_" + index + ":school_" + index + index + "$" + index)
               } else {
-
-                if (index == 9 && timeRow) {
-                  timeRow = false
-                  stringBuilder.append(index.toString + ",name_" + index
-                                       + ",city_" + index + "," + (10000.00 * index).toString + ",0.04,80.04" +
-                                       ",1990-01-04,2010-01-04 10:01:01,2010-01-04 10:01:01" +
-                                       ",school_" + index + ":school_" + index + index + "$" + index)
-                } else {
-                  stringBuilder.append(index.toString + ",name_" + index
-                                       + ",city_" + index + "," + (10000.00 * index).toString + ",0.01,80.01" +
-                                       ",1990-01-01,2010-01-01 10:01:01,2010-01-01 10:01:01" +
-                                       ",school_" + index + ":school_" + index + index + "$" + index)
-                }
+                stringBuilder.append(index.toString + ",name_" + index
+                                     + ",city_" + index + "," + (10000.00 * index).toString + ",0.01,80.01" +
+                                     ",1990-01-01,2010-01-01 10:01:01,2010-01-01 10:01:01" +
+                                     ",school_" + index + ":school_" + index + index + "$" + index)
               }
             } else {
               stringBuilder.append(index.toString + ",name_" + index

http://git-wip-us.apache.org/repos/asf/carbondata/blob/566217c7/integration/spark2/src/test/scala/org/apache/spark/carbondata/TestStreamingTableWithRowParser.scala
----------------------------------------------------------------------
diff --git a/integration/spark2/src/test/scala/org/apache/spark/carbondata/TestStreamingTableWithRowParser.scala b/integration/spark2/src/test/scala/org/apache/spark/carbondata/TestStreamingTableWithRowParser.scala
new file mode 100644
index 0000000..a3df2be
--- /dev/null
+++ b/integration/spark2/src/test/scala/org/apache/spark/carbondata/TestStreamingTableWithRowParser.scala
@@ -0,0 +1,944 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.carbondata
+
+import java.io.{File, PrintWriter}
+import java.math.BigDecimal
+import java.net.{BindException, ServerSocket}
+import java.sql.{Date, Timestamp}
+
+import scala.collection.mutable
+
+import org.apache.spark.sql.catalyst.TableIdentifier
+import org.apache.spark.sql.hive.CarbonRelation
+import org.apache.spark.sql.{CarbonEnv, Row, SparkSession}
+import org.apache.spark.sql.streaming.{ProcessingTime, StreamingQuery}
+import org.apache.spark.sql.test.util.QueryTest
+import org.scalatest.BeforeAndAfterAll
+
+import org.apache.carbondata.core.constants.CarbonCommonConstants
+import org.apache.carbondata.core.statusmanager.{FileFormat, SegmentStatus}
+import org.apache.carbondata.core.util.CarbonProperties
+import org.apache.carbondata.core.util.path.{CarbonStorePath, CarbonTablePath}
+import org.apache.carbondata.streaming.parser.CarbonStreamParser
+
+case class FileElement(school: Array[String], age: Integer)
+case class StreamData(id: Integer, name: String, city: String, salary: java.lang.Float,
+    tax: BigDecimal, percent: java.lang.Double, birthday: String,
+    register: String, updated: String,
+    file: FileElement)
+
+class TestStreamingTableWithRowParser extends QueryTest with BeforeAndAfterAll {
+
+  private val spark = sqlContext.sparkSession
+  private val dataFilePath = s"$resourcesPath/streamSample.csv"
+
+  override def beforeAll {
+    CarbonProperties.getInstance().addProperty(
+      CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT,
+      CarbonCommonConstants.CARBON_TIMESTAMP_DEFAULT_FORMAT)
+    CarbonProperties.getInstance().addProperty(
+      CarbonCommonConstants.CARBON_DATE_FORMAT,
+      CarbonCommonConstants.CARBON_DATE_DEFAULT_FORMAT)
+    sql("DROP DATABASE IF EXISTS streaming1 CASCADE")
+    sql("CREATE DATABASE streaming1")
+    sql("USE streaming1")
+
+    dropTable()
+
+    createTable(tableName = "stream_table_filter", streaming = true, withBatchLoad = true)
+
+    createTableWithComplexType(
+      tableName = "stream_table_filter_complex", streaming = true, withBatchLoad = true)
+  }
+
+  override def afterAll {
+    dropTable()
+    sql("USE default")
+    sql("DROP DATABASE IF EXISTS streaming1 CASCADE")
+  }
+
+  def dropTable(): Unit = {
+    sql("drop table if exists streaming1.stream_table_filter")
+    sql("drop table if exists streaming1.stream_table_filter_complex")
+  }
+
+  test("query on stream table with dictionary, sort_columns") {
+    executeStreamingIngest(
+      tableName = "stream_table_filter",
+      batchNums = 2,
+      rowNumsEachBatch = 25,
+      intervalOfSource = 5,
+      intervalOfIngest = 5,
+      continueSeconds = 20,
+      generateBadRecords = true,
+      badRecordAction = "force",
+      autoHandoff = false
+    )
+
+    // non-filter
+    val result = sql("select * from streaming1.stream_table_filter order by id, name").collect()
+    assert(result != null)
+    assert(result.length == 55)
+    // check one row of streaming data
+    assert(result(1).isNullAt(0))
+    assert(result(1).getString(1) == "name_6")
+    // check one row of batch loading
+    assert(result(50).getInt(0) == 100000001)
+    assert(result(50).getString(1) == "batch_1")
+
+    // filter
+    checkAnswer(
+      sql("select * from stream_table_filter where id = 1"),
+      Seq(Row(1, "name_1", "city_1", 10000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where id > 49 and id < 100000002"),
+      Seq(Row(50, "name_50", "city_50", 500000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0")),
+        Row(100000001, "batch_1", "city_1", 0.1, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where id between 50 and 100000001"),
+      Seq(Row(50, "name_50", "city_50", 500000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0")),
+        Row(100000001, "batch_1", "city_1", 0.1, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where name in ('name_9','name_10', 'name_11', 'name_12') and id <> 10 and id not in (11, 12)"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where name = 'name_3'"),
+      Seq(Row(3, "name_3", "city_3", 30000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where name like '%me_3%' and id < 30"),
+      Seq(Row(3, "name_3", "city_3", 30000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0"))))
+
+    checkAnswer(sql("select count(*) from stream_table_filter where name like '%ame%'"),
+      Seq(Row(49)))
+
+    checkAnswer(sql("select count(*) from stream_table_filter where name like '%batch%'"),
+      Seq(Row(5)))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where name >= 'name_3' and id < 4"),
+      Seq(Row(3, "name_3", "city_3", 30000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where id in (9, 10, 11, 12) and name <> 'name_10' and name not in ('name_11', 'name_12')"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where city = 'city_1'"),
+      Seq(Row(1, "name_1", "city_1", 10000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0")),
+        Row(100000001, "batch_1", "city_1", 0.1, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where city like '%ty_1%' and ( id < 10 or id >= 100000001)"),
+      Seq(Row(1, "name_1", "city_1", 10000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0")),
+        Row(100000001, "batch_1", "city_1", 0.1, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0"))))
+
+    checkAnswer(sql("select count(*) from stream_table_filter where city like '%city%'"),
+      Seq(Row(54)))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where city > 'city_09' and city < 'city_10'"),
+      Seq(Row(1, "name_1", "city_1", 10000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0")),
+        Row(100000001, "batch_1", "city_1", 0.1, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where city between 'city_09' and 'city_1'"),
+      Seq(Row(1, "name_1", "city_1", 10000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0")),
+        Row(100000001, "batch_1", "city_1", 0.1, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where id in (9, 10, 11, 12) and city <> 'city_10' and city not in ('city_11', 'city_12')"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where salary = 90000"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where salary > 80000 and salary <= 100000"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0")),
+        Row(10, "name_10", "city_10", 100000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where salary between 80001 and 90000"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where id in (9, 10, 11, 12) and salary <> 100000.0 and salary not in (110000.0, 120000.0)"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where tax = 0.04 and id < 100"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where tax >= 0.04 and id < 100"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where tax < 0.05 and tax > 0.02 and id < 100"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where tax between 0.02 and 0.04 and id < 100"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where id in (9, 10) and tax <> 0.01"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where percent = 80.04 and id < 100"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where percent >= 80.04 and id < 100"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where percent < 80.05 and percent > 80.02 and id < 100"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where percent between 80.02 and 80.05 and id < 100"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where id in (9, 10) and percent <> 80.01"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where birthday between '1990-01-04' and '1990-01-05'"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0")),
+        Row(100000004, "batch_4", "city_4", 0.4, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0")),
+        Row(100000005, "batch_5", "city_5", 0.5, BigDecimal.valueOf(0.05), 80.05, Date.valueOf("1990-01-05"), Timestamp.valueOf("2010-01-05 10:01:01.0"), Timestamp.valueOf("2010-01-05 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where birthday = '1990-01-04'"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0")),
+        Row(100000004, "batch_4", "city_4", 0.4, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where birthday > '1990-01-03' and birthday <= '1990-01-04'"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0")),
+        Row(100000004, "batch_4", "city_4", 0.4, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where birthday between '1990-01-04' and '1990-01-05'"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0")),
+        Row(100000004, "batch_4", "city_4", 0.4, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0")),
+        Row(100000005, "batch_5", "city_5", 0.5, BigDecimal.valueOf(0.05), 80.05, Date.valueOf("1990-01-05"), Timestamp.valueOf("2010-01-05 10:01:01.0"), Timestamp.valueOf("2010-01-05 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where id in (9, 10) and birthday <> '1990-01-01'"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where register = '2010-01-04 10:01:01'"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0")),
+        Row(100000004, "batch_4", "city_4", 0.4, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where register > '2010-01-03 10:01:01' and register <= '2010-01-04 10:01:01'"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0")),
+        Row(100000004, "batch_4", "city_4", 0.4, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where register between '2010-01-04 10:01:01' and '2010-01-05 10:01:01'"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0")),
+        Row(100000004, "batch_4", "city_4", 0.4, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0")),
+        Row(100000005, "batch_5", "city_5", 0.5, BigDecimal.valueOf(0.05), 80.05, Date.valueOf("1990-01-05"), Timestamp.valueOf("2010-01-05 10:01:01.0"), Timestamp.valueOf("2010-01-05 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where id in (9, 10) and register <> '2010-01-01 10:01:01'"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where updated = '2010-01-04 10:01:01'"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0")),
+        Row(100000004, "batch_4", "city_4", 0.4, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where updated > '2010-01-03 10:01:01' and register <= '2010-01-04 10:01:01'"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0")),
+        Row(100000004, "batch_4", "city_4", 0.4, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where updated between '2010-01-04 10:01:01' and '2010-01-05 10:01:01'"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0")),
+        Row(100000004, "batch_4", "city_4", 0.4, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0")),
+        Row(100000005, "batch_5", "city_5", 0.5, BigDecimal.valueOf(0.05), 80.05, Date.valueOf("1990-01-05"), Timestamp.valueOf("2010-01-05 10:01:01.0"), Timestamp.valueOf("2010-01-05 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where id in (9, 10) and updated <> '2010-01-01 10:01:01'"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where id is null order by name"),
+      Seq(Row(null, "", "", null, null, null, null, null, null),
+        Row(null, "name_6", "city_6", 60000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), null, Timestamp.valueOf("2010-01-01 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where name = ''"),
+      Seq(Row(null, "", "", null, null, null, null, null, null)))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where id is null and name <> ''"),
+      Seq(Row(null, "name_6", "city_6", 60000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), null, Timestamp.valueOf("2010-01-01 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where city = ''"),
+      Seq(Row(null, "", "", null, null, null, null, null, null)))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where id is null and city <> ''"),
+      Seq(Row(null, "name_6", "city_6", 60000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), null, Timestamp.valueOf("2010-01-01 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where salary is null"),
+      Seq(Row(null, "", "", null, null, null, null, null, null)))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where id is null and salary is not null"),
+      Seq(Row(null, "name_6", "city_6", 60000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), null, Timestamp.valueOf("2010-01-01 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where tax is null"),
+      Seq(Row(null, "", "", null, null, null, null, null, null)))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where id is null and tax is not null"),
+      Seq(Row(null, "name_6", "city_6", 60000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), null, Timestamp.valueOf("2010-01-01 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where percent is null"),
+      Seq(Row(null, "", "", null, null, null, null, null, null)))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where id is null and percent is not null"),
+      Seq(Row(null, "name_6", "city_6", 60000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), null, Timestamp.valueOf("2010-01-01 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where birthday is null"),
+      Seq(Row(null, "", "", null, null, null, null, null, null)))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where id is null and birthday is not null"),
+      Seq(Row(null, "name_6", "city_6", 60000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), null, Timestamp.valueOf("2010-01-01 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where register is null"),
+      Seq(Row(null, "", "", null, null, null, null, null, null),
+        Row(null, "name_6", "city_6", 60000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), null, Timestamp.valueOf("2010-01-01 10:01:01.0"))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where id is null and register is not null"),
+      Seq())
+
+    checkAnswer(
+      sql("select * from stream_table_filter where updated is null"),
+      Seq(Row(null, "", "", null, null, null, null, null, null)))
+
+    checkAnswer(
+      sql("select * from stream_table_filter where id is null and updated is not null"),
+      Seq(Row(null, "name_6", "city_6", 60000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), null, Timestamp.valueOf("2010-01-01 10:01:01.0"))))
+
+    // agg
+    checkAnswer(
+      sql("select count(*), max(id), min(name), cast(avg(id) as integer), sum(id) " +
+          "from stream_table_filter where id >= 2 and id <= 100000004"),
+      Seq(Row(51, 100000004, "batch_1", 7843162, 400001276)))
+
+    checkAnswer(
+      sql("select city, count(id), sum(id), cast(avg(id) as integer), " +
+          "max(salary), min(salary) " +
+          "from stream_table_filter " +
+          "where name in ('batch_1', 'batch_2', 'batch_3', 'name_1', 'name_2', 'name_3') " +
+          "and city <> '' " +
+          "group by city " +
+          "order by city"),
+      Seq(Row("city_1", 2, 100000002, 50000001, 10000.0, 0.1),
+        Row("city_2", 1, 100000002, 100000002, 0.2, 0.2),
+        Row("city_3", 2, 100000006, 50000003, 30000.0, 0.3)))
+
+    // batch loading
+    for(_ <- 0 to 2) {
+      executeBatchLoad("stream_table_filter")
+    }
+    checkAnswer(
+      sql("select count(*) from streaming1.stream_table_filter"),
+      Seq(Row(25 * 2 + 5 + 5 * 3)))
+
+    sql("alter table streaming1.stream_table_filter compact 'minor'")
+    Thread.sleep(5000)
+    val result1 = sql("show segments for table streaming1.stream_table_filter").collect()
+    result1.foreach { row =>
+      if (row.getString(0).equals("1")) {
+        assertResult(SegmentStatus.STREAMING.getMessage)(row.getString(1))
+        assertResult(FileFormat.ROW_V1.toString)(row.getString(5))
+      } else if (row.getString(0).equals("0.1")) {
+        assertResult(SegmentStatus.SUCCESS.getMessage)(row.getString(1))
+        assertResult(FileFormat.COLUMNAR_V3.toString)(row.getString(5))
+      } else {
+        assertResult(SegmentStatus.COMPACTED.getMessage)(row.getString(1))
+        assertResult(FileFormat.COLUMNAR_V3.toString)(row.getString(5))
+      }
+    }
+
+  }
+
+  test("query on stream table with dictionary, sort_columns and complex column") {
+    executeStreamingIngest(
+      tableName = "stream_table_filter_complex",
+      batchNums = 2,
+      rowNumsEachBatch = 25,
+      intervalOfSource = 5,
+      intervalOfIngest = 5,
+      continueSeconds = 20,
+      generateBadRecords = true,
+      badRecordAction = "force",
+      autoHandoff = false
+    )
+
+    // non-filter
+    val result = sql("select * from streaming1.stream_table_filter_complex order by id, name").collect()
+    assert(result != null)
+    assert(result.length == 55)
+    // check one row of streaming data
+    assert(result(0).isNullAt(0))
+    assert(result(0).getString(1) == "")
+    assert(result(0).getStruct(9).isNullAt(1))
+    // check one row of batch loading
+    assert(result(50).getInt(0) == 100000001)
+    assert(result(50).getString(1) == "batch_1")
+    assert(result(50).getStruct(9).getInt(1) == 20)
+
+    // filter
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where id = 1"),
+      Seq(Row(1, "name_1", "city_1", 10000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Row(wrap(Array("school_1", "school_11")), 1))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where id > 49 and id < 100000002"),
+      Seq(Row(50, "name_50", "city_50", 500000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Row(wrap(Array("school_50", "school_5050")), 50)),
+        Row(100000001, "batch_1", "city_1", 0.1, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Row(wrap(Array("school_1", "school_11")), 20))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where id between 50 and 100000001"),
+      Seq(Row(50, "name_50", "city_50", 500000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Row(wrap(Array("school_50", "school_5050")), 50)),
+        Row(100000001, "batch_1", "city_1", 0.1, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Row(wrap(Array("school_1", "school_11")), 20))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where name = 'name_3'"),
+      Seq(Row(3, "name_3", "city_3", 30000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Row(wrap(Array("school_3", "school_33")), 3))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where name like '%me_3%' and id < 30"),
+      Seq(Row(3, "name_3", "city_3", 30000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Row(wrap(Array("school_3", "school_33")), 3))))
+
+    checkAnswer(sql("select count(*) from stream_table_filter_complex where name like '%ame%'"),
+      Seq(Row(49)))
+
+    checkAnswer(sql("select count(*) from stream_table_filter_complex where name like '%batch%'"),
+      Seq(Row(5)))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where name >= 'name_3' and id < 4"),
+      Seq(Row(3, "name_3", "city_3", 30000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Row(wrap(Array("school_3", "school_33")), 3))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where city = 'city_1'"),
+      Seq(Row(1, "name_1", "city_1", 10000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Row(wrap(Array("school_1", "school_11")), 1)),
+        Row(100000001, "batch_1", "city_1", 0.1, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Row(wrap(Array("school_1", "school_11")), 20))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where city like '%ty_1%' and ( id < 10 or id >= 100000001)"),
+      Seq(Row(1, "name_1", "city_1", 10000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Row(wrap(Array("school_1", "school_11")), 1)),
+        Row(100000001, "batch_1", "city_1", 0.1, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Row(wrap(Array("school_1", "school_11")), 20))))
+
+    checkAnswer(sql("select count(*) from stream_table_filter_complex where city like '%city%'"),
+      Seq(Row(54)))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where city > 'city_09' and city < 'city_10'"),
+      Seq(Row(1, "name_1", "city_1", 10000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Row(wrap(Array("school_1", "school_11")), 1)),
+        Row(100000001, "batch_1", "city_1", 0.1, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Row(wrap(Array("school_1", "school_11")), 20))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where city between 'city_09' and 'city_1'"),
+      Seq(Row(1, "name_1", "city_1", 10000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Row(wrap(Array("school_1", "school_11")), 1)),
+        Row(100000001, "batch_1", "city_1", 0.1, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Row(wrap(Array("school_1", "school_11")), 20))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where salary = 90000"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_9", "school_99")), 9))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where salary > 80000 and salary <= 100000"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_9", "school_99")), 9)),
+        Row(10, "name_10", "city_10", 100000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Timestamp.valueOf("2010-01-01 10:01:01.0"), Row(wrap(Array("school_10", "school_1010")), 10))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where salary between 80001 and 90000"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_9", "school_99")), 9))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where tax = 0.04 and id < 100"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_9", "school_99")), 9))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where tax >= 0.04 and id < 100"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_9", "school_99")), 9))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where tax < 0.05 and tax > 0.02 and id < 100"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_9", "school_99")), 9))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where tax between 0.02 and 0.04 and id < 100"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_9", "school_99")), 9))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where percent = 80.04 and id < 100"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_9", "school_99")), 9))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where percent >= 80.04 and id < 100"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_9", "school_99")), 9))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where percent < 80.05 and percent > 80.02 and id < 100"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_9", "school_99")), 9))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where percent between 80.02 and 80.05 and id < 100"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_9", "school_99")), 9))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where birthday between '1990-01-04' and '1990-01-05'"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_9", "school_99")), 9)),
+        Row(100000004, "batch_4", "city_4", 0.4, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_4", "school_44")), 50)),
+        Row(100000005, "batch_5", "city_5", 0.5, BigDecimal.valueOf(0.05), 80.05, Date.valueOf("1990-01-05"), Timestamp.valueOf("2010-01-05 10:01:01.0"), Timestamp.valueOf("2010-01-05 10:01:01.0"), Row(wrap(Array("school_5", "school_55")), 60))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where birthday = '1990-01-04'"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_9", "school_99")), 9)),
+        Row(100000004, "batch_4", "city_4", 0.4, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_4", "school_44")), 50))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where birthday > '1990-01-03' and birthday <= '1990-01-04'"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_9", "school_99")), 9)),
+        Row(100000004, "batch_4", "city_4", 0.4, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_4", "school_44")), 50))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where birthday between '1990-01-04' and '1990-01-05'"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_9", "school_99")), 9)),
+        Row(100000004, "batch_4", "city_4", 0.4, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_4", "school_44")), 50)),
+        Row(100000005, "batch_5", "city_5", 0.5, BigDecimal.valueOf(0.05), 80.05, Date.valueOf("1990-01-05"), Timestamp.valueOf("2010-01-05 10:01:01.0"), Timestamp.valueOf("2010-01-05 10:01:01.0"), Row(wrap(Array("school_5", "school_55")), 60))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where register = '2010-01-04 10:01:01'"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_9", "school_99")), 9)),
+        Row(100000004, "batch_4", "city_4", 0.4, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_4", "school_44")), 50))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where register > '2010-01-03 10:01:01' and register <= '2010-01-04 10:01:01'"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_9", "school_99")), 9)),
+        Row(100000004, "batch_4", "city_4", 0.4, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_4", "school_44")), 50))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where register between '2010-01-04 10:01:01' and '2010-01-05 10:01:01'"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_9", "school_99")), 9)),
+        Row(100000004, "batch_4", "city_4", 0.4, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_4", "school_44")),50)),
+        Row(100000005, "batch_5", "city_5", 0.5, BigDecimal.valueOf(0.05), 80.05, Date.valueOf("1990-01-05"), Timestamp.valueOf("2010-01-05 10:01:01.0"), Timestamp.valueOf("2010-01-05 10:01:01.0"), Row(wrap(Array("school_5", "school_55")), 60))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where updated = '2010-01-04 10:01:01'"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_9", "school_99")), 9)),
+        Row(100000004, "batch_4", "city_4", 0.4, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_4", "school_44")), 50))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where updated > '2010-01-03 10:01:01' and register <= '2010-01-04 10:01:01'"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_9", "school_99")), 9)),
+        Row(100000004, "batch_4", "city_4", 0.4, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_4", "school_44")), 50))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where updated between '2010-01-04 10:01:01' and '2010-01-05 10:01:01'"),
+      Seq(Row(9, "name_9", "city_9", 90000.0, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_9", "school_99")), 9)),
+        Row(100000004, "batch_4", "city_4", 0.4, BigDecimal.valueOf(0.04), 80.04, Date.valueOf("1990-01-04"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Timestamp.valueOf("2010-01-04 10:01:01.0"), Row(wrap(Array("school_4", "school_44")), 50)),
+        Row(100000005, "batch_5", "city_5", 0.5, BigDecimal.valueOf(0.05), 80.05, Date.valueOf("1990-01-05"), Timestamp.valueOf("2010-01-05 10:01:01.0"), Timestamp.valueOf("2010-01-05 10:01:01.0"), Row(wrap(Array("school_5", "school_55")), 60))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where id is null order by name"),
+      Seq(Row(null, "", "", null, null, null, null, null, null, Row(wrap(Array(null, null)), null)),
+        Row(null, "name_6", "city_6", 60000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), null, Timestamp.valueOf("2010-01-01 10:01:01.0"), Row(wrap(Array("school_6", "school_66")), 6))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where name = ''"),
+      Seq(Row(null, "", "", null, null, null, null, null, null, Row(wrap(Array(null, null)), null))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where id is null and name <> ''"),
+      Seq(Row(null, "name_6", "city_6", 60000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), null, Timestamp.valueOf("2010-01-01 10:01:01.0"), Row(wrap(Array("school_6", "school_66")), 6))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where city = ''"),
+      Seq(Row(null, "", "", null, null, null, null, null, null, Row(wrap(Array(null, null)), null))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where id is null and city <> ''"),
+      Seq(Row(null, "name_6", "city_6", 60000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), null, Timestamp.valueOf("2010-01-01 10:01:01.0"), Row(wrap(Array("school_6", "school_66")), 6))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where salary is null"),
+      Seq(Row(null, "", "", null, null, null, null, null, null, Row(wrap(Array(null, null)), null))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where id is null and salary is not null"),
+      Seq(Row(null, "name_6", "city_6", 60000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), null, Timestamp.valueOf("2010-01-01 10:01:01.0"), Row(wrap(Array("school_6", "school_66")), 6))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where tax is null"),
+      Seq(Row(null, "", "", null, null, null, null, null, null, Row(wrap(Array(null, null)), null))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where id is null and tax is not null"),
+      Seq(Row(null, "name_6", "city_6", 60000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), null, Timestamp.valueOf("2010-01-01 10:01:01.0"), Row(wrap(Array("school_6", "school_66")), 6))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where percent is null"),
+      Seq(Row(null, "", "", null, null, null, null, null, null, Row(wrap(Array(null, null)), null))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where id is null and salary is not null"),
+      Seq(Row(null, "name_6", "city_6", 60000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), null, Timestamp.valueOf("2010-01-01 10:01:01.0"), Row(wrap(Array("school_6", "school_66")), 6))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where birthday is null"),
+      Seq(Row(null, "", "", null, null, null, null, null, null, Row(wrap(Array(null, null)), null))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where id is null and birthday is not null"),
+      Seq(Row(null, "name_6", "city_6", 60000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), null, Timestamp.valueOf("2010-01-01 10:01:01.0"), Row(wrap(Array("school_6", "school_66")), 6))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where register is null"),
+      Seq(Row(null, "", "", null, null, null, null, null, null, Row(wrap(Array(null, null)), null)),
+        Row(null, "name_6", "city_6", 60000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), null, Timestamp.valueOf("2010-01-01 10:01:01.0"), Row(wrap(Array("school_6", "school_66")), 6))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where id is null and register is not null"),
+      Seq())
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where updated is null"),
+      Seq(Row(null, "", "", null, null, null, null, null, null, Row(wrap(Array(null, null)), null))))
+
+    checkAnswer(
+      sql("select * from stream_table_filter_complex where id is null and updated is not null"),
+      Seq(Row(null, "name_6", "city_6", 60000.0, BigDecimal.valueOf(0.01), 80.01, Date.valueOf("1990-01-01"), null, Timestamp.valueOf("2010-01-01 10:01:01.0"), Row(wrap(Array("school_6", "school_66")), 6))))
+
+    // agg
+    checkAnswer(
+      sql("select count(*), max(id), min(name), cast(avg(file.age) as integer), sum(file.age) " +
+          "from stream_table_filter_complex where id >= 2 and id <= 100000004"),
+      Seq(Row(51, 100000004, "batch_1", 27, 1406)))
+
+    checkAnswer(
+      sql("select city, count(id), sum(id), cast(avg(file.age) as integer), " +
+          "max(salary), min(salary) " +
+          "from stream_table_filter_complex " +
+          "where name in ('batch_1', 'batch_2', 'batch_3', 'name_1', 'name_2', 'name_3') " +
+          "and city <> '' " +
+          "group by city " +
+          "order by city"),
+      Seq(Row("city_1", 2, 100000002, 10, 10000.0, 0.1),
+        Row("city_2", 1, 100000002, 30, 0.2, 0.2),
+        Row("city_3", 2, 100000006, 21, 30000.0, 0.3)))
+  }
+
+  def createWriteSocketThread(
+      serverSocket: ServerSocket,
+      writeNums: Int,
+      rowNums: Int,
+      intervalSecond: Int,
+      badRecords: Boolean = false): Thread = {
+    new Thread() {
+      override def run(): Unit = {
+        // wait for the client's connection request and accept it
+        val clientSocket = serverSocket.accept()
+        val socketWriter = new PrintWriter(clientSocket.getOutputStream())
+        var index = 0
+        for (_ <- 1 to writeNums) {
+          // write rowNums records per iteration
+          val stringBuilder = new StringBuilder()
+          for (_ <- 1 to rowNums) {
+            index = index + 1
+            if (badRecords) {
+              if (index == 2) {
+                // null value
+                stringBuilder.append(",,,,,,,,,")
+              } else if (index == 6) {
+                // illegal number
+                stringBuilder.append(index.toString + "abc,name_" + index
+                                     + ",city_" + index + "," + (10000.00 * index).toString + ",0.01,80.01" +
+                                     ",1990-01-01,2010-01-0110:01:01,2010-01-01 10:01:01" +
+                                     ",school_" + index + ":school_" + index + index + "$" + index)
+              } else if (index == 9) {
+                stringBuilder.append(index.toString + ",name_" + index
+                                     + ",city_" + index + "," + (10000.00 * index).toString + ",0.04,80.04" +
+                                     ",1990-01-04,2010-01-04 10:01:01,2010-01-04 10:01:01" +
+                                     ",school_" + index + ":school_" + index + index + "$" + index)
+              } else {
+                stringBuilder.append(index.toString + ",name_" + index
+                                     + ",city_" + index + "," + (10000.00 * index).toString + ",0.01,80.01" +
+                                     ",1990-01-01,2010-01-01 10:01:01,2010-01-01 10:01:01" +
+                                     ",school_" + index + ":school_" + index + index + "$" + index)
+              }
+            } else {
+              stringBuilder.append(index.toString + ",name_" + index
+                                   + ",city_" + index + "," + (10000.00 * index).toString + ",0.01,80.01" +
+                                   ",1990-01-01,2010-01-01 10:01:01,2010-01-01 10:01:01" +
+                                   ",school_" + index + ":school_" + index + index + "$" + index)
+            }
+            stringBuilder.append("\n")
+          }
+          socketWriter.append(stringBuilder.toString())
+          socketWriter.flush()
+          Thread.sleep(1000 * intervalSecond)
+        }
+        socketWriter.close()
+      }
+    }
+  }
+
+  def createSocketStreamingThread(
+      spark: SparkSession,
+      port: Int,
+      tablePath: CarbonTablePath,
+      tableIdentifier: TableIdentifier,
+      badRecordAction: String = "force",
+      intervalSecond: Int = 2,
+      handoffSize: Long = CarbonCommonConstants.HANDOFF_SIZE_DEFAULT,
+      autoHandoff: Boolean = CarbonCommonConstants.ENABLE_AUTO_HANDOFF_DEFAULT.toBoolean
+  ): Thread = {
+    new Thread() {
+      override def run(): Unit = {
+        var qry: StreamingQuery = null
+        try {
+          import spark.implicits._
+          val readSocketDF = spark.readStream
+            .format("socket")
+            .option("host", "localhost")
+            .option("port", port)
+            .load()
+            .as[String]
+            .map(_.split(","))
+            .map { fields => {
+              if (fields.length == 0) {
+                StreamData(null, "", "", null, null, null, null, null, null, null)
+              } else {
+                val tmp = fields(9).split("\\$")
+                val file = FileElement(tmp(0).split(":"), tmp(1).toInt)
+                if (fields(1).equals("name_6")) {
+                  StreamData(null, fields(1), fields(2), fields(3).toFloat,
+                      BigDecimal.valueOf(fields(4).toDouble), fields(5).toDouble,
+                      fields(6), fields(7), fields(8), file)
+                } else {
+                  StreamData(fields(0).toInt, fields(1), fields(2), fields(3).toFloat,
+                      BigDecimal.valueOf(fields(4).toDouble), fields(5).toDouble,
+                      fields(6), fields(7), fields(8), file)
+                }
+              }
+            } }
+
+          // Write data from socket stream to carbondata file
+          qry = readSocketDF.writeStream
+            .format("carbondata")
+            .trigger(ProcessingTime(s"$intervalSecond seconds"))
+            .option("checkpointLocation", tablePath.getStreamingCheckpointDir)
+            .option("bad_records_action", badRecordAction)
+            .option("dbName", tableIdentifier.database.get)
+            .option("tableName", tableIdentifier.table)
+            .option(CarbonCommonConstants.HANDOFF_SIZE, handoffSize)
+            .option("timestampformat", CarbonCommonConstants.CARBON_TIMESTAMP_DEFAULT_FORMAT)
+            .option(CarbonCommonConstants.ENABLE_AUTO_HANDOFF, autoHandoff)
+            .option(CarbonStreamParser.CARBON_STREAM_PARSER,
+              "org.apache.carbondata.streaming.parser.RowStreamParserImp")
+            .start()
+          qry.awaitTermination()
+        } catch {
+          case ex: Throwable =>
+            throw new Exception(ex.getMessage, ex)
+        } finally {
+          if (null != qry) {
+            qry.stop()
+          }
+        }
+      }
+    }
+  }
+
+  /**
+   * Start the ingestion thread: writes `rowNumsEachBatch` rows per batch, repeated `batchNums` times.
+   */
+  def executeStreamingIngest(
+      tableName: String,
+      batchNums: Int,
+      rowNumsEachBatch: Int,
+      intervalOfSource: Int,
+      intervalOfIngest: Int,
+      continueSeconds: Int,
+      generateBadRecords: Boolean,
+      badRecordAction: String,
+      handoffSize: Long = CarbonCommonConstants.HANDOFF_SIZE_DEFAULT,
+      autoHandoff: Boolean = CarbonCommonConstants.ENABLE_AUTO_HANDOFF_DEFAULT.toBoolean
+  ): Unit = {
+    val identifier = new TableIdentifier(tableName, Option("streaming1"))
+    val carbonTable = CarbonEnv.getInstance(spark).carbonMetastore.lookupRelation(identifier)(spark)
+      .asInstanceOf[CarbonRelation].metaData.carbonTable
+    val tablePath = CarbonStorePath.getCarbonTablePath(carbonTable.getAbsoluteTableIdentifier)
+    var server: ServerSocket = null
+    try {
+      server = getServerSocket()
+      val thread1 = createWriteSocketThread(
+        serverSocket = server,
+        writeNums = batchNums,
+        rowNums = rowNumsEachBatch,
+        intervalSecond = intervalOfSource,
+        badRecords = generateBadRecords)
+      val thread2 = createSocketStreamingThread(
+        spark = spark,
+        port = server.getLocalPort,
+        tablePath = tablePath,
+        tableIdentifier = identifier,
+        badRecordAction = badRecordAction,
+        intervalSecond = intervalOfIngest,
+        handoffSize = handoffSize,
+        autoHandoff = autoHandoff)
+      thread1.start()
+      thread2.start()
+      Thread.sleep(continueSeconds * 1000)
+      thread2.interrupt()
+      thread1.interrupt()
+    } finally {
+      if (null != server) {
+        server.close()
+      }
+    }
+  }
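A typical invocation of this helper, with illustrative argument values that are not taken from this patch, could look like:

    executeStreamingIngest(
      tableName = "stream_table_filter_complex",
      batchNums = 2,
      rowNumsEachBatch = 25,
      intervalOfSource = 5,
      intervalOfIngest = 5,
      continueSeconds = 20,
      generateBadRecords = true,
      badRecordAction = "force",
      handoffSize = CarbonCommonConstants.HANDOFF_SIZE_DEFAULT,
      autoHandoff = true)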
+
+  def createTable(tableName: String, streaming: Boolean, withBatchLoad: Boolean): Unit = {
+    sql(
+      s"""
+         | CREATE TABLE streaming1.$tableName(
+         | id INT,
+         | name STRING,
+         | city STRING,
+         | salary FLOAT,
+         | tax DECIMAL(8,2),
+         | percent double,
+         | birthday DATE,
+         | register TIMESTAMP,
+         | updated TIMESTAMP
+         | )
+         | STORED BY 'carbondata'
+         | TBLPROPERTIES(${if (streaming) "'streaming'='true', " else "" }
+         | 'sort_columns'='name', 'dictionary_include'='city,register')
+         | """.stripMargin)
+
+    if (withBatchLoad) {
+      // batch loading 5 rows
+      executeBatchLoad(tableName)
+    }
+  }
+
+  def createTableWithComplexType(
+      tableName: String,
+      streaming: Boolean,
+      withBatchLoad: Boolean): Unit = {
+    sql(
+      s"""
+         | CREATE TABLE streaming1.$tableName(
+         | id INT,
+         | name STRING,
+         | city STRING,
+         | salary FLOAT,
+         | tax DECIMAL(8,2),
+         | percent double,
+         | birthday DATE,
+         | register TIMESTAMP,
+         | updated TIMESTAMP,
+         | file struct<school:array<string>, age:int>
+         | )
+         | STORED BY 'carbondata'
+         | TBLPROPERTIES(${if (streaming) "'streaming'='true', " else "" }
+         | 'sort_columns'='name', 'dictionary_include'='id,name,salary,tax,percent,updated')
+         | """.stripMargin)
+
+    if (withBatchLoad) {
+      // batch loading 5 rows
+      executeBatchLoad(tableName)
+    }
+  }
+
+  def executeBatchLoad(tableName: String): Unit = {
+    sql(
+      s"""
+         | LOAD DATA LOCAL INPATH '$dataFilePath'
+         | INTO TABLE streaming1.$tableName
+         | OPTIONS('HEADER'='true')
+         """.stripMargin)
+  }
+
+  // wrap a String array as a WrappedArray so that expected rows compare equal
+  // to the array values returned in query results
+  def wrap(array: Array[String]): mutable.WrappedArray[String] = {
+    new mutable.WrappedArray.ofRef(array)
+  }
+
+  /**
+   * Get a ServerSocket.
+   * If the port is already in use, retry with a higher port number.
+   *
+   * @return ServerSocket
+   */
+  def getServerSocket(): ServerSocket = {
+    var port = 7071
+    var serverSocket: ServerSocket = null
+    var retry = false
+    do {
+      try {
+        retry = false
+        serverSocket = new ServerSocket(port)
+      } catch {
+        case ex: BindException =>
+          retry = true
+          port = port + 2
+          if (port >= 65535) {
+            throw ex
+          }
+      }
+    } while (retry)
+    serverSocket
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/carbondata/blob/566217c7/streaming/src/main/java/org/apache/carbondata/streaming/parser/CSVStreamParserImp.java
----------------------------------------------------------------------
diff --git a/streaming/src/main/java/org/apache/carbondata/streaming/parser/CSVStreamParserImp.java b/streaming/src/main/java/org/apache/carbondata/streaming/parser/CSVStreamParserImp.java
index eed3fd5..00d06b6 100644
--- a/streaming/src/main/java/org/apache/carbondata/streaming/parser/CSVStreamParserImp.java
+++ b/streaming/src/main/java/org/apache/carbondata/streaming/parser/CSVStreamParserImp.java
@@ -23,6 +23,7 @@ import com.univocity.parsers.csv.CsvParser;
 import com.univocity.parsers.csv.CsvParserSettings;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.types.StructType;
 
 /**
  * CSV Stream Parser, it is also the default parser.
@@ -31,7 +32,7 @@ public class CSVStreamParserImp implements CarbonStreamParser {
 
   private CsvParser csvParser;
 
-  @Override public void initialize(Configuration configuration) {
+  @Override public void initialize(Configuration configuration, StructType structType) {
     CsvParserSettings settings = CSVInputFormat.extractCsvParserSettings(configuration);
     csvParser = new CsvParser(settings);
   }

http://git-wip-us.apache.org/repos/asf/carbondata/blob/566217c7/streaming/src/main/java/org/apache/carbondata/streaming/parser/CarbonStreamParser.java
----------------------------------------------------------------------
diff --git a/streaming/src/main/java/org/apache/carbondata/streaming/parser/CarbonStreamParser.java b/streaming/src/main/java/org/apache/carbondata/streaming/parser/CarbonStreamParser.java
index a3b5592..643758c 100644
--- a/streaming/src/main/java/org/apache/carbondata/streaming/parser/CarbonStreamParser.java
+++ b/streaming/src/main/java/org/apache/carbondata/streaming/parser/CarbonStreamParser.java
@@ -19,6 +19,7 @@ package org.apache.carbondata.streaming.parser;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.types.StructType;
 
 /**
  * Stream parser interface
@@ -27,9 +28,10 @@ public interface CarbonStreamParser {
 
   String CARBON_STREAM_PARSER = "carbon.stream.parser";
 
-  String CARBON_STREAM_PARSER_DEFAULT = "org.apache.carbondata.streaming.parser.CSVStreamParserImp";
+  String CARBON_STREAM_PARSER_DEFAULT =
+      "org.apache.carbondata.streaming.parser.CSVStreamParserImp";
 
-  void initialize(Configuration configuration);
+  void initialize(Configuration configuration, StructType structType);
 
   Object[] parserRow(InternalRow value);
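With this change, a custom parser plugged in through the carbon.stream.parser option must accept the row schema as well. A minimal sketch of such an implementation, assuming a string-based field conversion and a hypothetical class name:

    import org.apache.hadoop.conf.Configuration
    import org.apache.spark.sql.catalyst.InternalRow
    import org.apache.spark.sql.types.StructType

    import org.apache.carbondata.streaming.parser.CarbonStreamParser

    // hypothetical example: render every field of the incoming row as a string,
    // using the schema passed to the new initialize(Configuration, StructType)
    class SimpleRowToStringParser extends CarbonStreamParser {

      private var schema: StructType = _

      override def initialize(configuration: Configuration, structType: StructType): Unit = {
        this.schema = structType
      }

      override def parserRow(value: InternalRow): Array[Object] = {
        schema.fields.indices.map { i =>
          val field = value.get(i, schema.fields(i).dataType)
          if (field == null) null else field.toString
        }.toArray[Object]
      }

      override def close(): Unit = {}
    }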
 

http://git-wip-us.apache.org/repos/asf/carbondata/blob/566217c7/streaming/src/main/scala/org/apache/carbondata/streaming/parser/RowStreamParserImp.scala
----------------------------------------------------------------------
diff --git a/streaming/src/main/scala/org/apache/carbondata/streaming/parser/RowStreamParserImp.scala b/streaming/src/main/scala/org/apache/carbondata/streaming/parser/RowStreamParserImp.scala
new file mode 100644
index 0000000..5a227cf
--- /dev/null
+++ b/streaming/src/main/scala/org/apache/carbondata/streaming/parser/RowStreamParserImp.scala
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.streaming.parser
+
+import java.text.SimpleDateFormat
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.types.StructType
+
+import org.apache.carbondata.core.constants.CarbonCommonConstants
+import org.apache.carbondata.processing.loading.constants.DataLoadProcessorConstants
+import org.apache.carbondata.spark.util.CarbonScalaUtil
+
+/**
+ * SparkSQL Row Stream Parser.
+ */
+class RowStreamParserImp extends CarbonStreamParser {
+
+  var configuration: Configuration = null
+  var structType: StructType = null
+  var encoder: ExpressionEncoder[Row] = null
+
+  var timeStampFormat: SimpleDateFormat = null
+  var dateFormat: SimpleDateFormat = null
+  var complexDelimiterLevel1: String = null
+  var complexDelimiterLevel2: String = null
+  var serializationNullFormat: String = null
+
+  override def initialize(configuration: Configuration, structType: StructType): Unit = {
+    this.configuration = configuration
+    this.structType = structType
+    this.encoder = RowEncoder.apply(this.structType).resolveAndBind()
+
+    this.timeStampFormat = new SimpleDateFormat(
+      this.configuration.get(CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT))
+    this.dateFormat = new SimpleDateFormat(
+      this.configuration.get(CarbonCommonConstants.CARBON_DATE_FORMAT))
+    this.complexDelimiterLevel1 = this.configuration.get("carbon_complex_delimiter_level_1")
+    this.complexDelimiterLevel2 = this.configuration.get("carbon_complex_delimiter_level_2")
+    this.serializationNullFormat =
+      this.configuration.get(DataLoadProcessorConstants.SERIALIZATION_NULL_FORMAT)
+  }
+
+  override def parserRow(value: InternalRow): Array[Object] = {
+    this.encoder.fromRow(value).toSeq.map { x => {
+      CarbonScalaUtil.getString(x,
+        serializationNullFormat, complexDelimiterLevel1, complexDelimiterLevel2,
+        timeStampFormat, dateFormat)
+    } }.toArray
+  }
+
+  override def close(): Unit = {
+  }
+}
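To route a streaming write through this parser instead of the default CSV parser, it is enough to set the parser option on the writer, as the test code above does. A condensed sketch, where inputDF stands for the application's streaming Dataset and the checkpoint path and table names are placeholders:

    import org.apache.carbondata.streaming.parser.CarbonStreamParser

    val query = inputDF.writeStream
      .format("carbondata")
      .option("checkpointLocation", "/tmp/stream_checkpoint")  // placeholder path
      .option("dbName", "streaming1")
      .option("tableName", "stream_table_filter_complex")
      .option(CarbonStreamParser.CARBON_STREAM_PARSER,
        "org.apache.carbondata.streaming.parser.RowStreamParserImp")
      .start()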

http://git-wip-us.apache.org/repos/asf/carbondata/blob/566217c7/streaming/src/main/scala/org/apache/spark/sql/execution/streaming/CarbonAppendableStreamSink.scala
----------------------------------------------------------------------
diff --git a/streaming/src/main/scala/org/apache/spark/sql/execution/streaming/CarbonAppendableStreamSink.scala b/streaming/src/main/scala/org/apache/spark/sql/execution/streaming/CarbonAppendableStreamSink.scala
index 67d8a4d..f2f9853 100644
--- a/streaming/src/main/scala/org/apache/spark/sql/execution/streaming/CarbonAppendableStreamSink.scala
+++ b/streaming/src/main/scala/org/apache/spark/sql/execution/streaming/CarbonAppendableStreamSink.scala
@@ -30,6 +30,7 @@ import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage
 import org.apache.spark.sql.{DataFrame, SparkSession}
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.execution.{QueryExecution, SQLExecution}
+import org.apache.spark.sql.types.StructType
 import org.apache.spark.util.{SerializableConfiguration, Utils}
 
 import org.apache.carbondata.common.CarbonIterator
@@ -44,6 +45,7 @@ import org.apache.carbondata.core.util.path.CarbonStorePath
 import org.apache.carbondata.events.{OperationContext, OperationListenerBus}
 import org.apache.carbondata.hadoop.streaming.CarbonStreamOutputFormat
 import org.apache.carbondata.hadoop.util.CarbonInputFormatUtil
+import org.apache.carbondata.processing.loading.constants.DataLoadProcessorConstants
 import org.apache.carbondata.processing.loading.events.LoadEvents.{LoadTablePostExecutionEvent, LoadTablePreExecutionEvent}
 import org.apache.carbondata.processing.loading.model.CarbonLoadModel
 import org.apache.carbondata.streaming.{CarbonStreamException, StreamHandoffRDD}
@@ -73,6 +75,20 @@ class CarbonAppendableStreamSink(
     parameters.foreach { entry =>
       conf.set(entry._1, entry._2)
     }
+    // properties below will be used for default CarbonStreamParser
+    conf.set("carbon_complex_delimiter_level_1",
+      carbonLoadModel.getComplexDelimiterLevel1)
+    conf.set("carbon_complex_delimiter_level_2",
+      carbonLoadModel.getComplexDelimiterLevel2)
+    conf.set(
+      DataLoadProcessorConstants.SERIALIZATION_NULL_FORMAT,
+      carbonLoadModel.getSerializationNullFormat().split(",")(1))
+    conf.set(
+      CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT,
+      carbonLoadModel.getTimestampformat())
+    conf.set(
+      CarbonCommonConstants.CARBON_DATE_FORMAT,
+      carbonLoadModel.getDateFormat())
     conf
   }
   // segment max size(byte)
@@ -223,6 +239,7 @@ object CarbonAppendableStreamSink {
           server.get.initializeDictionaryGenerator(carbonTable)
         }
 
+        val rowSchema = queryExecution.analyzed.schema
         // write data file
         result = sparkSession.sparkContext.runJob(queryExecution.toRdd,
           (taskContext: TaskContext, iterator: Iterator[InternalRow]) => {
@@ -233,7 +250,8 @@ object CarbonAppendableStreamSink {
               sparkPartitionId = taskContext.partitionId(),
               sparkAttemptNumber = taskContext.attemptNumber(),
               committer,
-              iterator
+              iterator,
+              rowSchema
             )
           })
 
@@ -280,7 +298,8 @@ object CarbonAppendableStreamSink {
       sparkPartitionId: Int,
       sparkAttemptNumber: Int,
       committer: FileCommitProtocol,
-      iterator: Iterator[InternalRow]
+      iterator: Iterator[InternalRow],
+      rowSchema: StructType
   ): TaskCommitMessage = {
 
     val jobId = CarbonInputFormatUtil.getJobId(new Date, sparkStageId)
@@ -311,7 +330,7 @@ object CarbonAppendableStreamSink {
 
         val streamParser =
           Class.forName(parserName).newInstance.asInstanceOf[CarbonStreamParser]
-        streamParser.initialize(taskAttemptContext.getConfiguration)
+        streamParser.initialize(taskAttemptContext.getConfiguration, rowSchema)
 
         StreamSegment.appendBatchData(new InputIterator(iterator, streamParser),
           taskAttemptContext, carbonLoadModel)