You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@carbondata.apache.org by ja...@apache.org on 2017/11/29 14:21:29 UTC

carbondata git commit: [CARBONDATA-1756] Improve Boolean data compress rate by changing RLE to SNAPPY algorithm

Repository: carbondata
Updated Branches:
  refs/heads/master 5f0f66f32 -> 06473484b


[CARBONDATA-1756] Improve Boolean data compress rate by changing RLE to SNAPPY algorithm

Improve the compression rate of Boolean data by switching its encoding from the RLE algorithm to the SNAPPY algorithm,
because in most scenarios RLE achieves a lower compression rate for Boolean data than SNAPPY does.

This PR changes only about 4 lines of the compression code, and it also adds test cases for the Boolean data compression rate.

This closes #1523


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/06473484
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/06473484
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/06473484

Branch: refs/heads/master
Commit: 06473484b6a824a7a4abc9c4f4e3ce86b62ac49b
Parents: 5f0f66f
Author: xubo245 <60...@qq.com>
Authored: Fri Nov 17 17:17:43 2017 +0800
Committer: Jacky Li <ja...@qq.com>
Committed: Wed Nov 29 22:21:10 2017 +0800

----------------------------------------------------------------------
 .../core/datastore/page/LazyColumnPage.java     |   2 +-
 .../page/encoding/DefaultEncodingFactory.java   |   3 +-
 .../page/encoding/EncodingFactory.java          |   4 +-
 .../compress/TestBooleanCompressSuite.scala     | 111 +++++++++++++++++++
 4 files changed, 115 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/06473484/core/src/main/java/org/apache/carbondata/core/datastore/page/LazyColumnPage.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/page/LazyColumnPage.java b/core/src/main/java/org/apache/carbondata/core/datastore/page/LazyColumnPage.java
index f2cb860..ce8aaae 100644
--- a/core/src/main/java/org/apache/carbondata/core/datastore/page/LazyColumnPage.java
+++ b/core/src/main/java/org/apache/carbondata/core/datastore/page/LazyColumnPage.java
@@ -267,7 +267,7 @@ public class LazyColumnPage extends ColumnPage {
 
   @Override
   public byte getByte(int rowId) {
-    throw new UnsupportedOperationException("internal error");
+    return columnPage.getByte(rowId);
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/carbondata/blob/06473484/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/DefaultEncodingFactory.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/DefaultEncodingFactory.java b/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/DefaultEncodingFactory.java
index 54467b2..0e32115 100644
--- a/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/DefaultEncodingFactory.java
+++ b/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/DefaultEncodingFactory.java
@@ -33,7 +33,6 @@ import org.apache.carbondata.core.datastore.page.encoding.dimension.legacy.Compl
 import org.apache.carbondata.core.datastore.page.encoding.dimension.legacy.DictDimensionIndexCodec;
 import org.apache.carbondata.core.datastore.page.encoding.dimension.legacy.DirectDictDimensionIndexCodec;
 import org.apache.carbondata.core.datastore.page.encoding.dimension.legacy.HighCardDictDimensionIndexCodec;
-import org.apache.carbondata.core.datastore.page.encoding.rle.RLECodec;
 import org.apache.carbondata.core.datastore.page.statistics.SimpleStatsResult;
 import org.apache.carbondata.core.metadata.datatype.DataType;
 import org.apache.carbondata.core.metadata.datatype.DataTypes;
@@ -116,7 +115,7 @@ public class DefaultEncodingFactory extends EncodingFactory {
     SimpleStatsResult stats = columnPage.getStatistics();
     DataType dataType = stats.getDataType();
     if (dataType == DataTypes.BOOLEAN) {
-      return new RLECodec().createEncoder(null);
+      return new DirectCompressCodec(columnPage.getDataType()).createEncoder(null);
     } else if (dataType == DataTypes.BYTE ||
         dataType == DataTypes.SHORT ||
         dataType == DataTypes.INT ||

http://git-wip-us.apache.org/repos/asf/carbondata/blob/06473484/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/EncodingFactory.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/EncodingFactory.java b/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/EncodingFactory.java
index 4a674e3..6d96b3b 100644
--- a/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/EncodingFactory.java
+++ b/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/EncodingFactory.java
@@ -98,9 +98,9 @@ public abstract class EncodingFactory {
       metadata.readFields(in);
       return new RLECodec().createDecoder(metadata);
     } else if (encoding == BOOL_BYTE) {
-      RLEEncoderMeta metadata = new RLEEncoderMeta();
+      ColumnPageEncoderMeta metadata = new ColumnPageEncoderMeta();
       metadata.readFields(in);
-      return new RLECodec().createDecoder(metadata);
+      return new DirectCompressCodec(metadata.getStoreDataType()).createDecoder(metadata);
     } else {
       // for backward compatibility
       ValueEncoderMeta metadata = CarbonUtil.deserializeEncoderMetaV3(encoderMeta);

http://git-wip-us.apache.org/repos/asf/carbondata/blob/06473484/integration/spark2/src/test/scala/org/apache/carbondata/spark/testsuite/booleantype/compress/TestBooleanCompressSuite.scala
----------------------------------------------------------------------
diff --git a/integration/spark2/src/test/scala/org/apache/carbondata/spark/testsuite/booleantype/compress/TestBooleanCompressSuite.scala b/integration/spark2/src/test/scala/org/apache/carbondata/spark/testsuite/booleantype/compress/TestBooleanCompressSuite.scala
new file mode 100644
index 0000000..230caaf
--- /dev/null
+++ b/integration/spark2/src/test/scala/org/apache/carbondata/spark/testsuite/booleantype/compress/TestBooleanCompressSuite.scala
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.spark.testsuite.booleantype.compress
+
+import java.io.{File, PrintWriter}
+
+import scala.util.Random
+
+import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach}
+
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.test.util.QueryTest
+
+import org.apache.carbondata.core.util.CarbonProperties
+
+/** Loads a large random boolean CSV into a carbon table and verifies the loaded row count. */
+class TestBooleanCompressSuite extends QueryTest with BeforeAndAfterEach with BeforeAndAfterAll {
+  val rootPath = new File(this.getClass.getResource("/").getPath + "../..").getCanonicalPath
+
+  override def beforeEach(): Unit = {
+    sql("drop table if exists boolean_table")
+  }
+
+  override def afterAll(): Unit = {
+    sql("drop table if exists boolean_table")
+    assert(deleteFile(randomBoolean))
+  }
+
+  val randomBoolean = s"$rootPath/src/test/resources/bool/supportRandomBooleanBigFile.csv"
+  // despite its name, this is the total number of rows generated (true and false together)
+  val trueNum = 10000000
+
+  override def beforeAll(): Unit = {
+    assert(createBooleanFileRandom(randomBoolean, trueNum, 0.5))
+    CarbonProperties.getInstance()
+      .addProperty("carbon.storelocation", s"$rootPath/target/warehouse/")
+  }
+
+  test("test boolean compress rate: random file") {
+    sql(
+      s"""
+         | CREATE TABLE boolean_table(
+         | booleanField BOOLEAN
+         | )
+         | STORED BY 'carbondata'
+       """.stripMargin)
+
+    sql(
+      s"""
+         | LOAD DATA LOCAL INPATH '${randomBoolean}'
+         | INTO TABLE boolean_table
+         | options('FILEHEADER'='booleanField')
+           """.stripMargin)
+
+    //    Test for compress rate
+    //    sql("select * from boolean_table").show(100)
+    //    sql("select count(*) from boolean_table").show()
+    //    sql("select count(*) from boolean_table where booleanField= true").show()
+    //    sql("select count(*) from boolean_table where booleanField= false").show()
+    checkAnswer(sql("select count(*) from boolean_table"), Row(trueNum))
+  }
+
+  // upper bound (exclusive) of the random draw used to decide each generated value
+  val randomNumber = 10000
+
+  /**
+   * Writes `totalLines` random true/false lines to `path`; `rate` is the expected share of true.
+   * @return true if all lines were written without an I/O error (exceptions fail an assertion)
+   */
+  def createBooleanFileRandom(path: String, totalLines: Int, rate: Double): Boolean = {
+    try {
+      val writer = new PrintWriter(path)
+      try {
+        val random = new Random()
+        val threshold = randomNumber * rate
+        for (_ <- 0 until totalLines) {
+          writer.println(random.nextInt(randomNumber) < threshold)
+        }
+        // PrintWriter swallows IOExceptions; checkError() flushes and reports them
+        !writer.checkError()
+      } finally writer.close()
+    } catch {
+      case e: Exception => assert(false, s"failed to write $path: $e"); false
+    }
+  }
+
+  /** Deletes the file at `path`; an already-missing file counts as success. */
+  def deleteFile(path: String): Boolean = {
+    try {
+      val file = new File(path)
+      !file.exists() || file.delete()
+    } catch {
+      case e: Exception => assert(false, s"failed to delete $path: $e"); false
+    }
+  }
+}