You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@carbondata.apache.org by ra...@apache.org on 2018/10/03 14:47:16 UTC

carbondata git commit: [CARBONDATA-2976] Support dumping column chunk metadata in CarbonCli

Repository: carbondata
Updated Branches:
  refs/heads/master fa9c8323c -> d8003a31c


[CARBONDATA-2976] Support dumping column chunk metadata in CarbonCli

When the -k option is given together with -c, CarbonCli prints the column chunk/page metadata for the specified column (the first 3 pages of each blocklet in the first data file).
For example, java CarbonCli -cmd summary -p <path> -c name -k
will output:

## Page Meta for column 'name' in file /Users/jacky/code/carbondata/tools/cli/CarbonCliTest/part-0-138391629343461_batchno0-0-null-138390048546321.carbondata
Blocklet 0:
Page 0 (offset 0, length 12049): DataChunk2(chunk_meta:ChunkCompressionMeta(compression_codec:DEPRECATED, total_uncompressed_size:0, total_compressed_size:0, compressor_name:snappy), rowMajor:false, data_page_length:12039, rowid_page_length:10, presence:PresenceMeta(represents_presence:false, present_bit_stream:00), sort_state:SORT_EXPLICIT, encoders:[INVERTED_INDEX], encoder_meta:[], min_max:BlockletMinMaxIndex(min_values:[72 6F 62 6F 74 30], max_values:[72 6F 62 6F 74 30], min_max_presence:[true]), numberOfRowsInpage:32000)
Page 1 (offset 12049, length 12049): DataChunk2(chunk_meta:ChunkCompressionMeta(compression_codec:DEPRECATED, total_uncompressed_size:0, total_compressed_size:0, compressor_name:snappy), rowMajor:false, data_page_length:12039, rowid_page_length:10, presence:PresenceMeta(represents_presence:false, present_bit_stream:00), sort_state:SORT_EXPLICIT, encoders:[INVERTED_INDEX], encoder_meta:[], min_max:BlockletMinMaxIndex(min_values:[72 6F 62 6F 74 30], max_values:[72 6F 62 6F 74 30], min_max_presence:[true]), numberOfRowsInpage:32000)
Page 2 (offset 24098, length 12049): DataChunk2(chunk_meta:ChunkCompressionMeta(compression_codec:DEPRECATED, total_uncompressed_size:0, total_compressed_size:0, compressor_name:snappy), rowMajor:false, data_page_length:12039, rowid_page_length:10, presence:PresenceMeta(represents_presence:false, present_bit_stream:00), sort_state:SORT_EXPLICIT, encoders:[INVERTED_INDEX], encoder_meta:[], min_max:BlockletMinMaxIndex(min_values:[72 6F 62 6F 74 30], max_values:[72 6F 62 6F 74 30], min_max_presence:[true]), numberOfRowsInpage:32000)

This closes #2771


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/d8003a31
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/d8003a31
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/d8003a31

Branch: refs/heads/master
Commit: d8003a31c602807f00d438d8be392992cb0955ac
Parents: fa9c832
Author: Jacky Li <ja...@qq.com>
Authored: Wed Sep 26 23:51:34 2018 +0800
Committer: ravipesala <ra...@gmail.com>
Committed: Wed Oct 3 20:17:04 2018 +0530

----------------------------------------------------------------------
 .../org/apache/carbondata/tool/CarbonCli.java   |  2 +
 .../org/apache/carbondata/tool/DataFile.java    |  8 +++-
 .../org/apache/carbondata/tool/DataSummary.java | 45 ++++++++++++++++++--
 .../apache/carbondata/tool/CarbonCliTest.java   | 13 ++++++
 4 files changed, 63 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/d8003a31/tools/cli/src/main/java/org/apache/carbondata/tool/CarbonCli.java
----------------------------------------------------------------------
diff --git a/tools/cli/src/main/java/org/apache/carbondata/tool/CarbonCli.java b/tools/cli/src/main/java/org/apache/carbondata/tool/CarbonCli.java
index 5725f8e..f1baa92 100644
--- a/tools/cli/src/main/java/org/apache/carbondata/tool/CarbonCli.java
+++ b/tools/cli/src/main/java/org/apache/carbondata/tool/CarbonCli.java
@@ -65,6 +65,7 @@ public class CarbonCli {
     Option segment = new Option("m", "showSegment", false, "print segment information");
     Option tblProperties = new Option("t", "tblProperties", false, "print table properties");
     Option detail = new Option("b", "blocklet", false, "print blocklet size detail");
+    Option columnMeta = new Option("k", "columnChunkMeta", false, "print column chunk meta");
     Option columnName = OptionBuilder
         .withArgName("column name")
         .hasArg()
@@ -82,6 +83,7 @@ public class CarbonCli {
     options.addOption(segment);
     options.addOption(tblProperties);
     options.addOption(detail);
+    options.addOption(columnMeta);
     options.addOption(columnName);
     return options;
   }

http://git-wip-us.apache.org/repos/asf/carbondata/blob/d8003a31/tools/cli/src/main/java/org/apache/carbondata/tool/DataFile.java
----------------------------------------------------------------------
diff --git a/tools/cli/src/main/java/org/apache/carbondata/tool/DataFile.java b/tools/cli/src/main/java/org/apache/carbondata/tool/DataFile.java
index da81d84..039401e 100644
--- a/tools/cli/src/main/java/org/apache/carbondata/tool/DataFile.java
+++ b/tools/cli/src/main/java/org/apache/carbondata/tool/DataFile.java
@@ -321,6 +321,8 @@ class DataFile {
     // they are set after calculation in DataSummary
     double minPercentage, maxPercentage;
 
+    DataChunk3 dataChunk;
+
     /**
      * Constructor
      * @param blockletInfo blocklet info which this column chunk belongs to
@@ -338,7 +340,7 @@ class DataFile {
       ByteBuffer buffer = fileReader.readByteBuffer(
           filePath, blockletInfo.column_data_chunks_offsets.get(columnIndex),
           blockletInfo.column_data_chunks_length.get(columnIndex));
-      DataChunk3 dataChunk = CarbonUtil.readDataChunk3(new ByteArrayInputStream(buffer.array()));
+      dataChunk = CarbonUtil.readDataChunk3(new ByteArrayInputStream(buffer.array()));
       this.localDict = dataChunk.isSetLocal_dictionary();
       if (this.localDict) {
         String compressorName = CarbonMetadataUtil.getCompressorNameFromChunkMeta(
@@ -376,6 +378,10 @@ class DataFile {
       return column.getDataType();
     }
 
+    public DataChunk3 getDataChunk3() {
+      return dataChunk;
+    }
+
     byte[] min(byte[] minValue) {
       if (minValue == null) {
         return min;

http://git-wip-us.apache.org/repos/asf/carbondata/blob/d8003a31/tools/cli/src/main/java/org/apache/carbondata/tool/DataSummary.java
----------------------------------------------------------------------
diff --git a/tools/cli/src/main/java/org/apache/carbondata/tool/DataSummary.java b/tools/cli/src/main/java/org/apache/carbondata/tool/DataSummary.java
index 6463977..5f1fb68 100644
--- a/tools/cli/src/main/java/org/apache/carbondata/tool/DataSummary.java
+++ b/tools/cli/src/main/java/org/apache/carbondata/tool/DataSummary.java
@@ -38,6 +38,8 @@ import org.apache.carbondata.core.statusmanager.LoadMetadataDetails;
 import org.apache.carbondata.core.statusmanager.SegmentStatusManager;
 import org.apache.carbondata.core.util.CarbonUtil;
 import org.apache.carbondata.format.BlockletInfo3;
+import org.apache.carbondata.format.DataChunk2;
+import org.apache.carbondata.format.DataChunk3;
 import org.apache.carbondata.format.FileFooter3;
 import org.apache.carbondata.format.FileHeader;
 import org.apache.carbondata.format.TableInfo;
@@ -91,6 +93,9 @@ class DataSummary implements Command {
     if (line.hasOption("c")) {
       String columName = line.getOptionValue("c");
       printColumnStats(columName);
+      if (line.hasOption("k")) {
+        printColumnChunkMeta(columName);
+      }
     }
   }
 
@@ -217,13 +222,13 @@ class DataSummary implements Command {
     throw new RuntimeException("schema for column " + columnName + " not found");
   }
 
+  // true if blocklet stats are collected
+  private boolean collected = false;
+
   private void printColumnStats(String columnName) throws IOException, MemoryException {
     out.println();
     out.println("## Column Statistics for '" + columnName + "'");
-    for (DataFile dataFile : dataFiles.values()) {
-      dataFile.initAllBlockletStats(columnName);
-    }
-    collectAllBlockletStats(dataFiles.values());
+    collectStats(columnName);
 
     int columnIndex = getColumnIndex(columnName);
     String[] header = new String[]{"BLK", "BLKLT", "Meta Size", "Data Size",
@@ -260,6 +265,38 @@ class DataSummary implements Command {
     printer.printFormatted(out);
   }
 
+  private void collectStats(String columnName) throws IOException, MemoryException {
+    if (!collected) {
+      for (DataFile dataFile : dataFiles.values()) {
+        dataFile.initAllBlockletStats(columnName);
+      }
+      collectAllBlockletStats(dataFiles.values());
+      collected = true;
+    }
+  }
+
+  private void printColumnChunkMeta(String columnName) throws IOException, MemoryException {
+    out.println();
+    DataFile file = dataFiles.entrySet().iterator().next().getValue();
+    out.println("## Page Meta for column '" + columnName + "' in file " + file.getFilePath());
+    collectStats(columnName);
+    for (int i = 0; i < file.getAllBlocklets().size(); i++) {
+      DataFile.Blocklet blocklet = file.getAllBlocklets().get(i);
+      DataChunk3 dataChunk3 = blocklet.getColumnChunk().getDataChunk3();
+      List<DataChunk2> dataChunk2List = dataChunk3.getData_chunk_list();
+      out.println(String.format("Blocklet %d:", i));
+
+      // There can be many pages; for debugging purposes,
+      // printing the first 3 pages of each blocklet is enough
+      for (int j = 0; j < dataChunk2List.size() && j < 3; j++) {
+        out.println(String.format("Page %d (offset %d, length %d): %s",
+            j, dataChunk3.page_offset.get(j), dataChunk3.page_length.get(j),
+            dataChunk2List.get(j).toString()));
+      }
+      out.println("\n");
+    }
+  }
+
   private void collectAllBlockletStats(Collection<DataFile> dataFiles) {
     // shard name mapping to blocklets belong to same shard
     Map<String, List<DataFile.Blocklet>> shards = new HashMap<>();

http://git-wip-us.apache.org/repos/asf/carbondata/blob/d8003a31/tools/cli/src/test/java/org/apache/carbondata/tool/CarbonCliTest.java
----------------------------------------------------------------------
diff --git a/tools/cli/src/test/java/org/apache/carbondata/tool/CarbonCliTest.java b/tools/cli/src/test/java/org/apache/carbondata/tool/CarbonCliTest.java
index fcd46c8..e526131 100644
--- a/tools/cli/src/test/java/org/apache/carbondata/tool/CarbonCliTest.java
+++ b/tools/cli/src/test/java/org/apache/carbondata/tool/CarbonCliTest.java
@@ -184,6 +184,19 @@ public class CarbonCliTest {
   }
 
   @Test
+  public void testSummaryPageMeta() {
+    String[] args = { "-cmd", "summary", "-p", path, "-c", "name", "-k"};
+    ByteArrayOutputStream out = new ByteArrayOutputStream();
+    PrintStream stream = new PrintStream(out);
+    CarbonCli.run(args, stream);
+    String output = new String(out.toByteArray());
+    Assert.assertTrue(
+        output.contains(
+            "Blocklet 0:\n"
+           + "Page 0 (offset 0, length 12049): DataChunk2(chunk_meta:ChunkCompressionMeta(compression_codec:DEPRECATED, total_uncompressed_size:0, total_compressed_size:0, compressor_name:snappy), rowMajor:false, data_page_length:12039, rowid_page_length:10, presence:PresenceMeta(represents_presence:false, present_bit_stream:00), sort_state:SORT_EXPLICIT, encoders:[INVERTED_INDEX], encoder_meta:[], min_max:BlockletMinMaxIndex(min_values:[72 6F 62 6F 74 30], max_values:[72 6F 62 6F 74 30], min_max_presence:[true]), numberOfRowsInpage:32000)"));
+  }
+
+  @Test
   public void testBenchmark() {
     String[] args = {"-cmd", "benchmark", "-p", path, "-a", "-c", "name"};
     ByteArrayOutputStream out = new ByteArrayOutputStream();