You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@carbondata.apache.org by ma...@apache.org on 2018/10/25 09:03:58 UTC

carbondata git commit: [CARBONDATA-2977] Write uncompress_size to ChunkCompressMeta in the file

Repository: carbondata
Updated Branches:
  refs/heads/master 33a6dc2ac -> e19c5da6d


[CARBONDATA-2977] Write uncompress_size to ChunkCompressMeta in the file

Currently total_uncompressed_size and total_compressed_size in the ChunkCompressionMeta in the carbondata file are always 0. This PR writes the
correct values to the file.

This closes #2772


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/e19c5da6
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/e19c5da6
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/e19c5da6

Branch: refs/heads/master
Commit: e19c5da6dbb07056b1053319d48a64a4b0715129
Parents: 33a6dc2
Author: Jacky Li <ja...@qq.com>
Authored: Thu Sep 27 00:39:29 2018 +0800
Committer: manishgupta88 <to...@gmail.com>
Committed: Thu Oct 25 14:38:28 2018 +0530

----------------------------------------------------------------------
 .../core/datastore/page/ColumnPage.java         | 39 +++++++++++
 .../datastore/page/LocalDictColumnPage.java     |  9 +++
 .../page/UnsafeFixLengthColumnPage.java         |  7 ++
 .../datastore/page/VarLengthColumnPageBase.java |  5 ++
 .../page/encoding/ColumnPageEncoder.java        |  7 +-
 .../core/util/CarbonMetadataUtil.java           | 10 +--
 .../apache/carbondata/tool/CarbonCliTest.java   | 69 ++++----------------
 7 files changed, 81 insertions(+), 65 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/e19c5da6/core/src/main/java/org/apache/carbondata/core/datastore/page/ColumnPage.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/page/ColumnPage.java b/core/src/main/java/org/apache/carbondata/core/datastore/page/ColumnPage.java
index 8b9a9a5..e8097da 100644
--- a/core/src/main/java/org/apache/carbondata/core/datastore/page/ColumnPage.java
+++ b/core/src/main/java/org/apache/carbondata/core/datastore/page/ColumnPage.java
@@ -724,6 +724,45 @@ public abstract class ColumnPage {
   }
 
   /**
+   * Return total page data length in bytes, computed according to the store data type
+   */
+  public long getPageLengthInBytes() throws IOException {
+    DataType dataType = columnPageEncoderMeta.getStoreDataType();
+    if (dataType == DataTypes.BOOLEAN) {
+      return getBooleanPage().length;
+    } else if (dataType == DataTypes.BYTE) {
+      return getBytePage().length;
+    } else if (dataType == DataTypes.SHORT) {
+      return getShortPage().length * SHORT.getSizeInBytes();
+    } else if (dataType == DataTypes.SHORT_INT) {
+      return getShortIntPage().length;
+    } else if (dataType == DataTypes.INT) {
+      return getIntPage().length * INT.getSizeInBytes();
+    } else if (dataType == DataTypes.LONG) {
+      return getLongPage().length * LONG.getSizeInBytes();
+    } else if (dataType == DataTypes.FLOAT) {
+      return getFloatPage().length * FLOAT.getSizeInBytes();
+    } else if (dataType == DataTypes.DOUBLE) {
+      return getDoublePage().length * DOUBLE.getSizeInBytes();
+    } else if (DataTypes.isDecimal(dataType)) {
+      return getDecimalPage().length;
+    } else if (dataType == DataTypes.BYTE_ARRAY
+        && columnPageEncoderMeta.getColumnSpec().getColumnType() == ColumnType.COMPLEX_PRIMITIVE) {
+      return getComplexChildrenLVFlattenedBytePage().length;
+    } else if (dataType == DataTypes.BYTE_ARRAY
+        && (columnPageEncoderMeta.getColumnSpec().getColumnType() == ColumnType.COMPLEX_STRUCT
+        || columnPageEncoderMeta.getColumnSpec().getColumnType() == ColumnType.COMPLEX_ARRAY
+        || columnPageEncoderMeta.getColumnSpec().getColumnType() == ColumnType.PLAIN_LONG_VALUE
+        || columnPageEncoderMeta.getColumnSpec().getColumnType() == ColumnType.PLAIN_VALUE)) {
+      return getComplexParentFlattenedBytePage().length;
+    } else if (dataType == DataTypes.BYTE_ARRAY) {
+      return getLVFlattenedBytePage().length;
+    } else {
+      throw new UnsupportedOperationException("unsupported data type for page length: " + dataType);
+    }
+  }
+
+  /**
    * Compress page data using specified compressor
    */
   public byte[] compress(Compressor compressor) throws MemoryException, IOException {

http://git-wip-us.apache.org/repos/asf/carbondata/blob/e19c5da6/core/src/main/java/org/apache/carbondata/core/datastore/page/LocalDictColumnPage.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/page/LocalDictColumnPage.java b/core/src/main/java/org/apache/carbondata/core/datastore/page/LocalDictColumnPage.java
index 3da154a..5cf2130 100644
--- a/core/src/main/java/org/apache/carbondata/core/datastore/page/LocalDictColumnPage.java
+++ b/core/src/main/java/org/apache/carbondata/core/datastore/page/LocalDictColumnPage.java
@@ -344,4 +344,13 @@ public class LocalDictColumnPage extends ColumnPage {
   @Override public void convertValue(ColumnPageValueConverter codec) {
     throw new UnsupportedOperationException("Operation not supported");
   }
+
+  @Override
+  public long getPageLengthInBytes() throws IOException {
+    if (null != pageLevelDictionary) {
+      return encodedDataColumnPage.getPageLengthInBytes();
+    } else {
+      return actualDataColumnPage.getPageLengthInBytes();
+    }
+  }
 }

http://git-wip-us.apache.org/repos/asf/carbondata/blob/e19c5da6/core/src/main/java/org/apache/carbondata/core/datastore/page/UnsafeFixLengthColumnPage.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/page/UnsafeFixLengthColumnPage.java b/core/src/main/java/org/apache/carbondata/core/datastore/page/UnsafeFixLengthColumnPage.java
index 7df29df..da0e487 100644
--- a/core/src/main/java/org/apache/carbondata/core/datastore/page/UnsafeFixLengthColumnPage.java
+++ b/core/src/main/java/org/apache/carbondata/core/datastore/page/UnsafeFixLengthColumnPage.java
@@ -534,6 +534,13 @@ public class UnsafeFixLengthColumnPage extends ColumnPage {
     }
   }
 
+  @Override
+  public long getPageLengthInBytes() {
+    // For unsafe column page, we are always tracking the total length
+    // so return it directly instead of calculate it again (super class implementation)
+    return totalLength;
+  }
+
   @Override public byte[] compress(Compressor compressor) throws MemoryException, IOException {
     if (UnsafeMemoryManager.isOffHeap() && compressor.supportUnsafe()) {
       // use raw compression and copy to byte[]

http://git-wip-us.apache.org/repos/asf/carbondata/blob/e19c5da6/core/src/main/java/org/apache/carbondata/core/datastore/page/VarLengthColumnPageBase.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/page/VarLengthColumnPageBase.java b/core/src/main/java/org/apache/carbondata/core/datastore/page/VarLengthColumnPageBase.java
index 35d0009..39b8282 100644
--- a/core/src/main/java/org/apache/carbondata/core/datastore/page/VarLengthColumnPageBase.java
+++ b/core/src/main/java/org/apache/carbondata/core/datastore/page/VarLengthColumnPageBase.java
@@ -518,4 +518,9 @@ public abstract class VarLengthColumnPageBase extends ColumnPage {
       rowOffset = null;
     }
   }
+
+  @Override
+  public long getPageLengthInBytes() throws IOException {
+    return totalLength;
+  }
 }

http://git-wip-us.apache.org/repos/asf/carbondata/blob/e19c5da6/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/ColumnPageEncoder.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/ColumnPageEncoder.java b/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/ColumnPageEncoder.java
index f38aef2..32eb5ab 100644
--- a/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/ColumnPageEncoder.java
+++ b/core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/ColumnPageEncoder.java
@@ -109,9 +109,10 @@ public abstract class ColumnPageEncoder {
     return dataChunk;
   }
 
-  private void fillBasicFields(ColumnPage inputPage, DataChunk2 dataChunk) {
-    dataChunk.setChunk_meta(
-        CarbonMetadataUtil.getChunkCompressorMeta(inputPage.getColumnCompressorName()));
+  private void fillBasicFields(ColumnPage inputPage, DataChunk2 dataChunk)
+      throws IOException {
+    dataChunk.setChunk_meta(CarbonMetadataUtil.getChunkCompressorMeta(inputPage,
+        dataChunk.getData_page_length()));
     dataChunk.setNumberOfRowsInpage(inputPage.getPageSize());
     dataChunk.setRowMajor(false);
   }

http://git-wip-us.apache.org/repos/asf/carbondata/blob/e19c5da6/core/src/main/java/org/apache/carbondata/core/util/CarbonMetadataUtil.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/carbondata/core/util/CarbonMetadataUtil.java b/core/src/main/java/org/apache/carbondata/core/util/CarbonMetadataUtil.java
index b156ae6..9c82fa4 100644
--- a/core/src/main/java/org/apache/carbondata/core/util/CarbonMetadataUtil.java
+++ b/core/src/main/java/org/apache/carbondata/core/util/CarbonMetadataUtil.java
@@ -26,6 +26,7 @@ import org.apache.carbondata.common.logging.LogServiceFactory;
 import org.apache.carbondata.core.datastore.blocklet.BlockletEncodedColumnPage;
 import org.apache.carbondata.core.datastore.blocklet.EncodedBlocklet;
 import org.apache.carbondata.core.datastore.compression.CompressorFactory;
+import org.apache.carbondata.core.datastore.page.ColumnPage;
 import org.apache.carbondata.core.datastore.page.encoding.EncodedColumnPage;
 import org.apache.carbondata.core.datastore.page.statistics.SimpleStatsResult;
 import org.apache.carbondata.core.datastore.page.statistics.TablePageStatistics;
@@ -313,15 +314,16 @@ public class CarbonMetadataUtil {
    * before 1.5.0, we set a enum 'compression_codec';
    * after 1.5.0, we use string 'compressor_name' instead
    */
-  public static ChunkCompressionMeta getChunkCompressorMeta(String compressorName) {
+  public static ChunkCompressionMeta getChunkCompressorMeta(
+      ColumnPage inputPage, long encodedDataLength) throws IOException {
     ChunkCompressionMeta chunkCompressionMeta = new ChunkCompressionMeta();
     // we will not use this field any longer and will use compressor_name instead,
     // but in thrift definition, this field is required so we cannot set it to null, otherwise
     // it will cause deserialization error in runtime (required field cannot be null).
     chunkCompressionMeta.setCompression_codec(CompressionCodec.DEPRECATED);
-    chunkCompressionMeta.setCompressor_name(compressorName);
-    chunkCompressionMeta.setTotal_compressed_size(0);
-    chunkCompressionMeta.setTotal_uncompressed_size(0);
+    chunkCompressionMeta.setCompressor_name(inputPage.getColumnCompressorName());
+    chunkCompressionMeta.setTotal_compressed_size(encodedDataLength);
+    chunkCompressionMeta.setTotal_uncompressed_size(inputPage.getPageLengthInBytes());
     return chunkCompressionMeta;
   }
 

http://git-wip-us.apache.org/repos/asf/carbondata/blob/e19c5da6/tools/cli/src/test/java/org/apache/carbondata/tool/CarbonCliTest.java
----------------------------------------------------------------------
diff --git a/tools/cli/src/test/java/org/apache/carbondata/tool/CarbonCliTest.java b/tools/cli/src/test/java/org/apache/carbondata/tool/CarbonCliTest.java
index e526131..d53e3d9 100644
--- a/tools/cli/src/test/java/org/apache/carbondata/tool/CarbonCliTest.java
+++ b/tools/cli/src/test/java/org/apache/carbondata/tool/CarbonCliTest.java
@@ -77,7 +77,7 @@ public class CarbonCliTest {
         output.contains(
             "Input Folder: ./CarbonCliTest\n"
           + "## Summary\n"
-          + "total: 6 blocks, 2 shards, 14 blocklets, 314 pages, 10,000,000 rows, 32.26MB\n"
+          + "total: 6 blocks, 2 shards, 14 blocklets, 314 pages, 10,000,000 rows, 32.27MB\n"
           + "avg: 5.38MB/block, 2.30MB/blocklet, 1,666,666 rows/block, 714,285 rows/blocklet"));
 
     String[] args2 = {"-cmd", "summary", "-p", path, "-s"};
@@ -116,7 +116,7 @@ public class CarbonCliTest {
           + "1    1      25        800,000  2.58MB    \n"
           + "2    0      25        800,000  2.58MB    \n"
           + "2    1      25        800,000  2.58MB    \n"
-          + "2    2      7         200,000  660.74KB  "));
+          + "2    2      7         200,000  660.79KB  "));
 
     String[] args5 = {"-cmd", "summary", "-p", path, "-c", "name"};
     out = new ByteArrayOutputStream();
@@ -126,61 +126,13 @@ public class CarbonCliTest {
     Assert.assertTrue(
         output.contains(
             "BLK  BLKLT  Meta Size  Data Size  LocalDict  DictEntries  DictSize  AvgPageSize  Min%    Max%    \n"
-          + "0    0      1.72KB     295.89KB   false      0            0.0B      11.77KB      robot0  robot1  \n"
-          + "0    1      1.72KB     295.89KB   false      0            0.0B      11.77KB      robot1  robot3  \n"
-          + "1    0      1.72KB     295.89KB   false      0            0.0B      11.77KB      robot3  robot4  \n"
-          + "1    1      1.72KB     295.89KB   false      0            0.0B      11.77KB      robot4  robot6  \n"
-          + "2    0      1.72KB     295.89KB   false      0            0.0B      11.77KB      robot6  robot7  \n"
-          + "2    1      1.72KB     295.89KB   false      0            0.0B      11.77KB      robot8  robot9  \n"
-          + "2    2      492.0B     74.03KB    false      0            0.0B      10.51KB      robot9  robot9  "));
-  }
-
-  @Test
-  public void testSummaryOutputAll() {
-    String[] args = {"-cmd", "summary", "-p", path, "-a", "-c", "age"};
-    ByteArrayOutputStream out = new ByteArrayOutputStream();
-    PrintStream stream = new PrintStream(out);
-    CarbonCli.run(args, stream);
-    String output = new String(out.toByteArray());
-    Assert.assertTrue(
-        output.contains(
-            "Input Folder: ./CarbonCliTest\n"
-          + "## Summary\n"
-          + "total: 6 blocks, 2 shards, 14 blocklets, 314 pages, 10,000,000 rows, 32.26MB\n"
-          + "avg: 5.38MB/block, 2.30MB/blocklet, 1,666,666 rows/block, 714,285 rows/blocklet\n"));
-
-    Assert.assertTrue(
-        output.contains(
-            "Column Name  Data Type  Column Type  SortColumn  Encoding          Ordinal  Id  \n"
-          + "name         STRING     dimension    true        [INVERTED_INDEX]  0        NA  \n"
-          + "age          INT        measure      false       []                1        NA  \n"));
-
-    Assert.assertTrue(
-        output.contains(
-            "## Table Properties\n"
-          + "schema file not found"));
-
-    Assert.assertTrue(
-        output.contains(
-            "BLK  BLKLT  NumPages  NumRows  Size      \n"
-          + "0    0      25        800,000  2.58MB    \n"
-          + "0    1      25        800,000  2.58MB    \n"
-          + "1    0      25        800,000  2.58MB    \n"
-          + "1    1      25        800,000  2.58MB    \n"
-          + "2    0      25        800,000  2.58MB    \n"
-          + "2    1      25        800,000  2.58MB    \n"
-          + "2    2      7         200,000  660.74KB  "));
-
-    Assert.assertTrue(
-        output.contains(
-          "BLK  BLKLT  Meta Size  Data Size  LocalDict  DictEntries  DictSize  AvgPageSize  Min%  Max%   \n"
-        + "0    0      2.90KB     4.87MB     false      0            0.0B      93.76KB      0.0   100.0  \n"
-        + "0    1      2.90KB     2.29MB     false      0            0.0B      93.76KB      0.0   100.0  \n"
-        + "1    0      2.90KB     4.87MB     false      0            0.0B      93.76KB      0.0   100.0  \n"
-        + "1    1      2.90KB     2.29MB     false      0            0.0B      93.76KB      0.0   100.0  \n"
-        + "2    0      2.90KB     5.52MB     false      0            0.0B      93.76KB      0.0   100.0  \n"
-        + "2    1      2.90KB     2.94MB     false      0            0.0B      93.76KB      0.0   100.0  \n"
-        + "2    2      830.0B     586.81KB   false      0            0.0B      83.71KB      0.0   100.0 "));
+          + "0    0      1.81KB     295.98KB   false      0            0.0B      11.77KB      robot0  robot1  \n"
+          + "0    1      1.81KB     295.99KB   false      0            0.0B      11.77KB      robot1  robot3  \n"
+          + "1    0      1.81KB     295.98KB   false      0            0.0B      11.77KB      robot3  robot4  \n"
+          + "1    1      1.81KB     295.99KB   false      0            0.0B      11.77KB      robot4  robot6  \n"
+          + "2    0      1.81KB     295.98KB   false      0            0.0B      11.77KB      robot6  robot7  \n"
+          + "2    1      1.81KB     295.98KB   false      0            0.0B      11.77KB      robot8  robot9  \n"
+          + "2    2      519.0B     74.06KB    false      0            0.0B      10.51KB      robot9  robot9  "));
   }
 
   @Test
@@ -190,10 +142,11 @@ public class CarbonCliTest {
     PrintStream stream = new PrintStream(out);
     CarbonCli.run(args, stream);
     String output = new String(out.toByteArray());
+    System.out.println(output);
     Assert.assertTrue(
         output.contains(
             "Blocklet 0:\n"
-           + "Page 0 (offset 0, length 12049): DataChunk2(chunk_meta:ChunkCompressionMeta(compression_codec:DEPRECATED, total_uncompressed_size:0, total_compressed_size:0, compressor_name:snappy), rowMajor:false, data_page_length:12039, rowid_page_length:10, presence:PresenceMeta(represents_presence:false, present_bit_stream:00), sort_state:SORT_EXPLICIT, encoders:[INVERTED_INDEX], encoder_meta:[], min_max:BlockletMinMaxIndex(min_values:[72 6F 62 6F 74 30], max_values:[72 6F 62 6F 74 30], min_max_presence:[true]), numberOfRowsInpage:32000)"));
+           + "Page 0 (offset 0, length 12049): DataChunk2(chunk_meta:ChunkCompressionMeta(compression_codec:DEPRECATED, total_uncompressed_size:256000, total_compressed_size:12049, compressor_name:snappy), rowMajor:false, data_page_length:12039, rowid_page_length:10, presence:PresenceMeta(represents_presence:false, present_bit_stream:00), sort_state:SORT_EXPLICIT, encoders:[INVERTED_INDEX], encoder_meta:[], min_max:BlockletMinMaxIndex(min_values:[72 6F 62 6F 74 30], max_values:[72 6F 62 6F 74 30], min_max_presence:[true]), numberOfRowsInpage:32000)"));
   }
 
   @Test