Posted to commits@carbondata.apache.org by gv...@apache.org on 2017/03/20 12:43:19 UTC

[1/2] incubator-carbondata git commit: When data contains the long max and min values for a measure column with BigInt data type, the selected delta compression is DATA_BYTE, which is incorrect.

Repository: incubator-carbondata
Updated Branches:
  refs/heads/master 8d0a672b9 -> e3fd98bbf


When data contains the long max and min values for a measure column with BigInt data type, the selected delta compression is DATA_BYTE, which is incorrect. To choose the delta compression, the min value is subtracted from the max value; because the min value here is negative, the subtraction effectively becomes an addition and the result exceeds the long range. The overflowing value wraps around into the negative range, so a far narrower compression type is selected than the data requires.
This leads to data loss and incorrect query results.
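
A minimal sketch in plain Java (independent of the CarbonData code) shows how the wrap-around makes the delta look byte-sized:

    public class DeltaOverflowDemo {
      public static void main(String[] args) {
        long max = Long.MAX_VALUE;  //  9223372036854775807
        long min = Long.MIN_VALUE;  // -9223372036854775808
        // Mathematically the delta is 2^64 - 1, but 64-bit two's-complement
        // arithmetic wraps around, so the subtraction yields -1.
        long delta = max - min;
        System.out.println(delta);  // prints -1
        // -1 fits in a byte, so a selection routine that trusts this delta
        // wrongly concludes that one byte per value is enough.
      }
    }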


Project: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/commit/4f915d10
Tree: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/tree/4f915d10
Diff: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/diff/4f915d10

Branch: refs/heads/master
Commit: 4f915d102a5f6186aed5a74cfe23a0a81ea62aee
Parents: 8d0a672
Author: manishgupta88 <to...@gmail.com>
Authored: Mon Mar 20 17:13:02 2017 +0530
Committer: manishgupta88 <to...@gmail.com>
Committed: Mon Mar 20 17:13:21 2017 +0530

----------------------------------------------------------------------
 .../carbondata/core/util/ValueCompressionUtil.java  | 12 ++++++++++--
 .../test/resources/testBigInt_boundary_value.csv    | 16 ++++++++++++++++
 .../spark/testsuite/bigdecimal/TestBigInt.scala     | 13 +++++++++++++
 3 files changed, 39 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-carbondata/blob/4f915d10/core/src/main/java/org/apache/carbondata/core/util/ValueCompressionUtil.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/carbondata/core/util/ValueCompressionUtil.java b/core/src/main/java/org/apache/carbondata/core/util/ValueCompressionUtil.java
index 6a14373..c8a9397 100644
--- a/core/src/main/java/org/apache/carbondata/core/util/ValueCompressionUtil.java
+++ b/core/src/main/java/org/apache/carbondata/core/util/ValueCompressionUtil.java
@@ -181,8 +181,16 @@ public final class ValueCompressionUtil {
       int mantissa, byte dataTypeSelected, char measureStoreType) {
     DataType adaptiveDataType = getDataType((long) maxValue, mantissa, dataTypeSelected);
     int adaptiveSize = getSize(adaptiveDataType);
-    DataType deltaDataType = getDataType((long) maxValue - (long) minValue, mantissa,
-        dataTypeSelected);
+    DataType deltaDataType = null;
+    // we cannot apply delta compression when the actual data type of the column is long.
+    // Consider the scenario where the max and min values equal the long max and min values,
+    // OR where max - min exceeds the long range: in both cases the subtraction overflows,
+    // so the compression type cannot be determined reliably.
+    if (adaptiveDataType == DataType.DATA_LONG) {
+      deltaDataType = DataType.DATA_BIGINT;
+    } else {
+      deltaDataType = getDataType((long) maxValue - (long) minValue, mantissa, dataTypeSelected);
+    }
     int deltaSize = getSize(deltaDataType);
     if (adaptiveSize > deltaSize) {
       return new CompressionFinder(COMPRESSION_TYPE.DELTA_DOUBLE, DataType.DATA_BIGINT,

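For reference, here is a self-contained sketch of the guarded selection. The class, enum, and helpers below are simplified, hypothetical stand-ins for the real ValueCompressionUtil members, shown only to illustrate the control flow:

    class DeltaSelectionSketch {
      enum DataType { DATA_BYTE, DATA_SHORT, DATA_INT, DATA_LONG, DATA_BIGINT }

      // Simplified width selection: pick the narrowest type that holds the value.
      static DataType getDataType(long value) {
        if (value >= Byte.MIN_VALUE && value <= Byte.MAX_VALUE) return DataType.DATA_BYTE;
        if (value >= Short.MIN_VALUE && value <= Short.MAX_VALUE) return DataType.DATA_SHORT;
        if (value >= Integer.MIN_VALUE && value <= Integer.MAX_VALUE) return DataType.DATA_INT;
        return DataType.DATA_LONG;
      }

      static DataType selectDeltaDataType(long maxValue, long minValue) {
        DataType adaptiveDataType = getDataType(maxValue);
        if (adaptiveDataType == DataType.DATA_LONG) {
          // maxValue - minValue could overflow the long range, so fall back
          // to the uncompressed big int representation instead of a delta.
          return DataType.DATA_BIGINT;
        }
        // Otherwise compute the delta type from max - min, as before.
        return getDataType(maxValue - minValue);
      }
    }

With the guard in place, selectDeltaDataType(Long.MAX_VALUE, Long.MIN_VALUE) returns DATA_BIGINT instead of the DATA_BYTE that the overflowing subtraction used to suggest.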
http://git-wip-us.apache.org/repos/asf/incubator-carbondata/blob/4f915d10/integration/spark-common-test/src/test/resources/testBigInt_boundary_value.csv
----------------------------------------------------------------------
diff --git a/integration/spark-common-test/src/test/resources/testBigInt_boundary_value.csv b/integration/spark-common-test/src/test/resources/testBigInt_boundary_value.csv
new file mode 100644
index 0000000..80ec0b4
--- /dev/null
+++ b/integration/spark-common-test/src/test/resources/testBigInt_boundary_value.csv
@@ -0,0 +1,16 @@
+Invalid values,92233720368547758071234
+Invalid values,def
+All_null_values,null
+All_zeros_values,0
+Max_range_values,9223372036854775807
+Max_range_values,9223372036854775807
+Max_range_values,9223372036854775807
+Max_range_values,9223372036854775807
+Min_range_values,-9223372036854775808
+Min_range_values,-9223372036854775807
+Min_range_values,-9223372036854775806
+Min_range_values,-9223372036854775805
+Normal_values,2345
+Normal_values,1234
+Normal_values,3456
+Normal_values,4567

http://git-wip-us.apache.org/repos/asf/incubator-carbondata/blob/4f915d10/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/bigdecimal/TestBigInt.scala
----------------------------------------------------------------------
diff --git a/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/bigdecimal/TestBigInt.scala b/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/bigdecimal/TestBigInt.scala
index 59fc9d3..f738040 100644
--- a/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/bigdecimal/TestBigInt.scala
+++ b/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/bigdecimal/TestBigInt.scala
@@ -79,6 +79,19 @@ class TestBigInt extends QueryTest with BeforeAndAfterAll {
       sql("select count(distinct salary) from hiveTable"))
   }
 
+  test("test big int data type storage for boundary values") {
+    sql("DROP TABLE IF EXISTS Test_Boundary_carbon")
+    sql("DROP TABLE IF EXISTS Test_Boundary_hive")
+    sql("create table Test_Boundary_carbon (c1_String string, c2_Bigint Bigint) STORED BY 'org.apache.carbondata.format'")
+    sql(s"LOAD DATA LOCAL INPATH '$resourcesPath/testBigInt_boundary_value.csv' into table Test_Boundary_carbon OPTIONS('FILEHEADER'='c1_String,c2_Bigint')")
+    sql("create table Test_Boundary_hive (c1_String string, c2_Bigint Bigint) row format delimited fields terminated by ','")
+    sql(s"LOAD DATA LOCAL INPATH '$resourcesPath/testBigInt_boundary_value.csv' into table Test_Boundary_hive")
+    checkAnswer(sql("select c2_bigInt*23 from Test_Boundary_carbon where c2_BigInt<9223372036854775807"),
+      sql("select c2_bigInt*23 from Test_Boundary_hive where c2_BigInt<9223372036854775807"))
+    sql("DROP TABLE IF EXISTS Test_Boundary_carbon")
+    sql("DROP TABLE IF EXISTS Test_Boundary_hive")
+  }
+
   override def afterAll {
     sql("drop table if exists carbonTable")
     sql("drop table if exists hiveTable")


[2/2] incubator-carbondata git commit: [CARBONDATA-797] Data loss for BigInt data type if data contains long max and min values. This closes #676

Posted by gv...@apache.org.
[CARBONDATA-797] Data loss for BigInt data type if data contains long max and min values. This closes #676


Project: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/commit/e3fd98bb
Tree: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/tree/e3fd98bb
Diff: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/diff/e3fd98bb

Branch: refs/heads/master
Commit: e3fd98bbf9ee101425fa18fa43d0f32b0fa1825a
Parents: 8d0a672 4f915d1
Author: Venkata Ramana G <ra...@huawei.com>
Authored: Mon Mar 20 18:03:30 2017 +0530
Committer: Venkata Ramana G <ra...@huawei.com>
Committed: Mon Mar 20 18:03:30 2017 +0530

----------------------------------------------------------------------
 .../carbondata/core/util/ValueCompressionUtil.java  | 12 ++++++++++--
 .../test/resources/testBigInt_boundary_value.csv    | 16 ++++++++++++++++
 .../spark/testsuite/bigdecimal/TestBigInt.scala     | 13 +++++++++++++
 3 files changed, 39 insertions(+), 2 deletions(-)
----------------------------------------------------------------------