You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@carbondata.apache.org by ja...@apache.org on 2017/03/16 15:09:17 UTC
[1/2] incubator-carbondata git commit: Fixed data mismatch due to
min/max calculation in V3 format
Repository: incubator-carbondata
Updated Branches:
refs/heads/master 26c888180 -> f75766119
Fixed data mismatch due to min/max calculation in V3 format
Fixed data mismatch due to min/max calculation in V3 format
Project: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/commit/6b1ab2a8
Tree: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/tree/6b1ab2a8
Diff: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/diff/6b1ab2a8
Branch: refs/heads/master
Commit: 6b1ab2a85cf9ef11fc0a9189853e38e1e515d727
Parents: 26c8881
Author: ravipesala <ra...@gmail.com>
Authored: Thu Mar 16 17:15:16 2017 +0530
Committer: jackylk <ja...@huawei.com>
Committed: Thu Mar 16 22:57:50 2017 +0800
----------------------------------------------------------------------
.../core/util/CarbonMetadataUtil.java | 21 ++++++-
.../carbondata/CarbonDataSourceSuite.scala | 61 ++++++++++++++++++++
2 files changed, 80 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-carbondata/blob/6b1ab2a8/core/src/main/java/org/apache/carbondata/core/util/CarbonMetadataUtil.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/carbondata/core/util/CarbonMetadataUtil.java b/core/src/main/java/org/apache/carbondata/core/util/CarbonMetadataUtil.java
index 03f43cd..f134f0c 100644
--- a/core/src/main/java/org/apache/carbondata/core/util/CarbonMetadataUtil.java
+++ b/core/src/main/java/org/apache/carbondata/core/util/CarbonMetadataUtil.java
@@ -245,12 +245,29 @@ public class CarbonMetadataUtil {
public static BlockletIndex getBlockletIndex(List<NodeHolder> nodeHolderList,
List<CarbonMeasure> carbonMeasureList) {
BlockletMinMaxIndex blockletMinMaxIndex = new BlockletMinMaxIndex();
- for (byte[] max : nodeHolderList.get(nodeHolderList.size() - 1).getColumnMaxData()) {
+ // Calculating min/max for each column.
+ byte[][] minCol = nodeHolderList.get(0).getColumnMinData().clone();
+ byte[][] maxCol = nodeHolderList.get(0).getColumnMaxData().clone();
+ for (NodeHolder nodeHolder : nodeHolderList) {
+ byte[][] columnMaxData = nodeHolder.getColumnMaxData();
+ byte[][] columnMinData = nodeHolder.getColumnMinData();
+ for (int i = 0; i < maxCol.length; i++) {
+ if (ByteUtil.UnsafeComparer.INSTANCE.compareTo(columnMaxData[i], maxCol[i]) > 0) {
+ maxCol[i] = columnMaxData[i];
+ }
+ if (ByteUtil.UnsafeComparer.INSTANCE.compareTo(columnMinData[i], minCol[i]) < 0) {
+ minCol[i] = columnMinData[i];
+ }
+ }
+ }
+ // Writing min/max to thrift file
+ for (byte[] max : maxCol) {
blockletMinMaxIndex.addToMax_values(ByteBuffer.wrap(max));
}
- for (byte[] min : nodeHolderList.get(0).getColumnMinData()) {
+ for (byte[] min : minCol) {
blockletMinMaxIndex.addToMin_values(ByteBuffer.wrap(min));
}
+
byte[][] measureMaxValue = nodeHolderList.get(0).getMeasureColumnMaxData().clone();
byte[][] measureMinValue = nodeHolderList.get(0).getMeasureColumnMinData().clone();
byte[] minVal = null;
http://git-wip-us.apache.org/repos/asf/incubator-carbondata/blob/6b1ab2a8/integration/spark2/src/test/scala/org/apache/spark/carbondata/CarbonDataSourceSuite.scala
----------------------------------------------------------------------
diff --git a/integration/spark2/src/test/scala/org/apache/spark/carbondata/CarbonDataSourceSuite.scala b/integration/spark2/src/test/scala/org/apache/spark/carbondata/CarbonDataSourceSuite.scala
index 97a180b..5cd6ea0 100644
--- a/integration/spark2/src/test/scala/org/apache/spark/carbondata/CarbonDataSourceSuite.scala
+++ b/integration/spark2/src/test/scala/org/apache/spark/carbondata/CarbonDataSourceSuite.scala
@@ -18,8 +18,12 @@
package org.apache.spark.carbondata
import org.apache.spark.sql.common.util.QueryTest
+import org.apache.spark.sql.types._
+import org.apache.spark.sql.{Row, SaveMode}
import org.scalatest.BeforeAndAfterAll
+import org.apache.carbondata.core.util.CarbonProperties
+
class CarbonDataSourceSuite extends QueryTest with BeforeAndAfterAll {
override def beforeAll(): Unit = {
// Drop table
@@ -76,4 +80,61 @@ class CarbonDataSourceSuite extends QueryTest with BeforeAndAfterAll {
""".stripMargin)
}
+ test("Data mismatch because of min/max calculation while loading the data") {
+ CarbonProperties.getInstance()
+ .addProperty("carbon.blockletgroup.size.in.mb", "16")
+ .addProperty("carbon.enable.vector.reader", "true")
+ .addProperty("enable.unsafe.sort", "true")
+
+ val rdd = sqlContext.sparkContext
+ .parallelize(1 to 1200000, 4)
+ .map { x =>
+ ("city" + x % 8, "country" + x % 1103, "planet" + x % 10007, x.toString,
+ (x % 16).toShort, x / 2, (x << 1).toLong, x.toDouble / 13, x.toDouble / 11)
+ }.map { x =>
+ Row(x._1, x._2, x._3, x._4, x._5, x._6, x._7, x._8, x._9)
+ }
+
+ val schema = StructType(
+ Seq(
+ StructField("city", StringType, nullable = false),
+ StructField("country", StringType, nullable = false),
+ StructField("planet", StringType, nullable = false),
+ StructField("id", StringType, nullable = false),
+ StructField("m1", ShortType, nullable = false),
+ StructField("m2", IntegerType, nullable = false),
+ StructField("m3", LongType, nullable = false),
+ StructField("m4", DoubleType, nullable = false),
+ StructField("m5", DoubleType, nullable = false)
+ )
+ )
+
+ val input = sqlContext.createDataFrame(rdd, schema)
+ sql(s"drop table if exists testBigData")
+ input.write
+ .format("carbondata")
+ .option("tableName", "testBigData")
+ .option("tempCSV", "false")
+ .option("single_pass", "true")
+ .option("dictionary_exclude", "id") // id is high cardinality column
+ .mode(SaveMode.Overwrite)
+ .save()
+ sql(s"select city, sum(m1) from testBigData " +
+ s"where country='country12' group by city order by city").show()
+ checkAnswer(
+ sql(s"select city, sum(m1) from testBigData " +
+ s"where country='country12' group by city order by city"),
+ Seq(Row("city0", 544),
+ Row("city1", 680),
+ Row("city2", 816),
+ Row("city3", 952),
+ Row("city4", 1088),
+ Row("city5", 1224),
+ Row("city6", 1360),
+ Row("city7", 1496)))
+ sql(s"drop table if exists testBigData")
+ CarbonProperties.getInstance()
+ .addProperty("carbon.blockletgroup.size.in.mb", "64")
+ }
+
}
[2/2] incubator-carbondata git commit: [CARBONDATA-786] Data mismatch
if the data is loaded across blocklet groups This closes #664
Posted by ja...@apache.org.
[CARBONDATA-786] Data mismatch if the data is loaded across blocklet groups This closes #664
Project: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/commit/f7576611
Tree: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/tree/f7576611
Diff: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/diff/f7576611
Branch: refs/heads/master
Commit: f75766119036a05c09f57a40f7697c5ff9663bbc
Parents: 26c8881 6b1ab2a
Author: jackylk <ja...@huawei.com>
Authored: Thu Mar 16 23:09:02 2017 +0800
Committer: jackylk <ja...@huawei.com>
Committed: Thu Mar 16 23:09:02 2017 +0800
----------------------------------------------------------------------
.../core/util/CarbonMetadataUtil.java | 21 ++++++-
.../carbondata/CarbonDataSourceSuite.scala | 61 ++++++++++++++++++++
2 files changed, 80 insertions(+), 2 deletions(-)
----------------------------------------------------------------------