You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by ju...@apache.org on 2017/05/12 22:02:30 UTC

parquet-mr git commit: PARQUET-196: parquet-tools command for row count & size

Repository: parquet-mr
Updated Branches:
  refs/heads/master a703ee75c -> fd7cfed07


PARQUET-196: parquet-tools command for row count & size

This is a rebase of the already existing PR:
https://github.com/apache/parquet-mr/pull/132

Author: Swapnil Shinde <sw...@gmail.com>

Closes #406 from swapnilushinde/master and squashes the following commits:

59a8980 [Swapnil Shinde] Spacing to conform java style (if/for) is fixed
5fd0279 [Swapnil Shinde] Parquet-196: parquet-tools command for row count & size


Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/fd7cfed0
Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/fd7cfed0
Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/fd7cfed0

Branch: refs/heads/master
Commit: fd7cfed070c2aab60521afb7dcc633a0b7abea80
Parents: a703ee7
Author: Swapnil Shinde <sw...@gmail.com>
Authored: Fri May 12 15:02:27 2017 -0700
Committer: Julien Le Dem <ju...@apache.org>
Committed: Fri May 12 15:02:27 2017 -0700

----------------------------------------------------------------------
 .../apache/parquet/tools/command/Registry.java  |   2 +
 .../parquet/tools/command/RowCountCommand.java  |  97 +++++++++++++
 .../parquet/tools/command/SizeCommand.java      | 140 +++++++++++++++++++
 parquet-tools/src/main/scripts/parquet-rowcount |  28 ++++
 parquet-tools/src/main/scripts/parquet-size     |  28 ++++
 5 files changed, 295 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/fd7cfed0/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java
----------------------------------------------------------------------
diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java
index 0e69f48..6df84be 100644
--- a/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java
+++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java
@@ -32,6 +32,8 @@ public final class Registry {
     registry.put("meta", ShowMetaCommand.class);
     registry.put("dump", DumpCommand.class);
     registry.put("merge", MergeCommand.class);
+    registry.put("rowcount", RowCountCommand.class);
+    registry.put("size", SizeCommand.class);
   }
 
   public static Map<String,Command> allCommands() {

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/fd7cfed0/parquet-tools/src/main/java/org/apache/parquet/tools/command/RowCountCommand.java
----------------------------------------------------------------------
diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/RowCountCommand.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/RowCountCommand.java
new file mode 100644
index 0000000..37d6079
--- /dev/null
+++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/RowCountCommand.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.tools.command;
+
+import java.io.PrintWriter;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+
+import org.apache.parquet.hadoop.Footer;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.metadata.BlockMetaData;
+import org.apache.parquet.tools.Main;
+
+public class RowCountCommand extends ArgsOnlyCommand {
+  private FileStatus[] inputFileStatuses;
+  private Configuration conf;
+  private Path inputPath;
+  private PrintWriter out;
+  public static final String[] USAGE = new String[] {
+          "<input>",
+          "where <input> is the parquet file to count rows to stdout"
+  };
+
+  public static final Options OPTIONS;
+  static {
+    OPTIONS = new Options();
+    Option detailed = OptionBuilder.withLongOpt("detailed")
+            .withDescription("Detailed rowcount of each matching file")
+            .create('d');
+    OPTIONS.addOption(detailed);
+  }
+
+  public RowCountCommand() {
+    super(1, 1);
+  }
+
+  @Override
+  public Options getOptions() {
+    return OPTIONS;
+  }
+
+  @Override
+  public String[] getUsageDescription() {
+    return USAGE;
+  }
+
+  @Override
+  public void execute(CommandLine options) throws Exception {
+    super.execute(options);
+
+    String[] args = options.getArgs();
+    String input = args[0];
+    out = new PrintWriter(Main.out, true);
+    inputPath = new Path(input);
+    conf = new Configuration();
+    inputFileStatuses = inputPath.getFileSystem(conf).globStatus(inputPath);
+    long rowCount = 0;
+
+    for (FileStatus fs : inputFileStatuses) {
+      long fileRowCount=0;
+      for (Footer f : ParquetFileReader.readFooters(conf, fs, false)) {
+        for (BlockMetaData b : f.getParquetMetadata().getBlocks()) {
+          rowCount += b.getRowCount();
+          fileRowCount += b.getRowCount();
+        }
+      }
+      if (options.hasOption('d')) {
+        out.format("%s row count: %d\n", fs.getPath().getName(), fileRowCount);
+      }
+    }
+
+    out.format("Total RowCount: %d", rowCount);
+    out.println();
+  }
+}

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/fd7cfed0/parquet-tools/src/main/java/org/apache/parquet/tools/command/SizeCommand.java
----------------------------------------------------------------------
diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/SizeCommand.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/SizeCommand.java
new file mode 100644
index 0000000..bcc6704
--- /dev/null
+++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/SizeCommand.java
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.parquet.tools.command;
+
+import java.io.PrintWriter;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+
+import org.apache.parquet.hadoop.Footer;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.metadata.BlockMetaData;
+import org.apache.parquet.tools.Main;
+
+public class SizeCommand extends ArgsOnlyCommand {
+  private FileStatus[] inputFileStatuses;
+  private Configuration conf;
+  private Path inputPath;
+  private PrintWriter out;
+  private static final double ONE_KB = 1024;
+  private static final double ONE_MB = ONE_KB * 1024;
+  private static final double ONE_GB = ONE_MB * 1024;
+  private static final double ONE_TB = ONE_GB * 1024;
+  private static final double ONE_PB = ONE_TB * 1024;
+
+  public static final String[] USAGE = new String[] {
+          "<input>",
+          "where <input> is the parquet file to get size & human readable size to stdout"
+  };
+
+  public static final Options OPTIONS;
+  static {
+    OPTIONS = new Options();
+    Option help = OptionBuilder.withLongOpt("pretty")
+            .withDescription("Pretty size")
+            .create('p');
+    OPTIONS.addOption(help);
+    Option uncompressed = OptionBuilder.withLongOpt("uncompressed")
+            .withDescription("Uncompressed size")
+            .create('u');
+    OPTIONS.addOption(uncompressed);
+    Option detailed = OptionBuilder.withLongOpt("detailed")
+            .withDescription("Detailed size of each matching file")
+            .create('d');
+    OPTIONS.addOption(detailed);
+  }
+
+  public SizeCommand() {
+    super(1, 1);
+  }
+
+  @Override
+  public Options getOptions() {
+    return OPTIONS;
+  }
+
+  @Override
+  public String[] getUsageDescription() {
+    return USAGE;
+  }
+
+  @Override
+  public void execute(CommandLine options) throws Exception {
+    super.execute(options);
+
+    String[] args = options.getArgs();
+    String input = args[0];
+    out = new PrintWriter(Main.out, true);
+    inputPath = new Path(input);
+    conf = new Configuration();
+    inputFileStatuses = inputPath.getFileSystem(conf).globStatus(inputPath);
+    long size = 0;
+    for (FileStatus fs : inputFileStatuses) {
+      long fileSize = 0;
+      for (Footer f : ParquetFileReader.readFooters(conf, fs, false)) {
+        for (BlockMetaData b : f.getParquetMetadata().getBlocks()) {
+          size += (options.hasOption('u') ? b.getTotalByteSize() : b.getCompressedSize());
+          fileSize += (options.hasOption('u') ? b.getTotalByteSize() : b.getCompressedSize());
+        }
+      }
+      if (options.hasOption('d')) {
+        if (options.hasOption('p')) {
+          out.format("%s: %s\n", fs.getPath().getName(), getPrettySize(fileSize));
+        }
+        else {
+          out.format("%s: %d bytes\n", fs.getPath().getName(), fileSize);
+        }
+      }
+    }
+
+    if (options.hasOption('p')) {
+      out.format("Total Size: %s", getPrettySize(size));
+    }
+    else {
+      out.format("Total Size: %d bytes", size);
+    }
+    out.println();
+  }
+
+  public String getPrettySize(long bytes){
+    if (bytes/ONE_KB < 1) {
+      return  String.format("%d", bytes) + " bytes";
+    }
+    if (bytes/ONE_MB < 1) {
+      return String.format("%.3f", bytes/ONE_KB) + " KB";
+    }
+    if (bytes/ONE_GB < 1) {
+      return String.format("%.3f", bytes/ONE_MB) + " MB";
+    }
+    if (bytes/ONE_TB < 1) {
+      return String.format("%.3f", bytes/ONE_GB) + " GB";
+    }
+    if (bytes/ONE_PB < 1) {
+      return String.format("%.3f", bytes/ONE_TB) + " TB";
+    }
+    return String.format("%.3f", bytes/ONE_PB) + " PB";
+  }
+}

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/fd7cfed0/parquet-tools/src/main/scripts/parquet-rowcount
----------------------------------------------------------------------
diff --git a/parquet-tools/src/main/scripts/parquet-rowcount b/parquet-tools/src/main/scripts/parquet-rowcount
new file mode 100644
index 0000000..ab12e71
--- /dev/null
+++ b/parquet-tools/src/main/scripts/parquet-rowcount
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# The name of the top-level script
+TOPSCRIPT="parquet-tools"
+
+# Determine the path to the script's directory
+APPPATH=$( cd "$(dirname "$0")" ; pwd -P )
+
+# Run the application
+exec "${APPPATH}/${TOPSCRIPT}" rowcount "$@"

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/fd7cfed0/parquet-tools/src/main/scripts/parquet-size
----------------------------------------------------------------------
diff --git a/parquet-tools/src/main/scripts/parquet-size b/parquet-tools/src/main/scripts/parquet-size
new file mode 100644
index 0000000..c9048b0
--- /dev/null
+++ b/parquet-tools/src/main/scripts/parquet-size
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# The name of the top-level script
+TOPSCRIPT="parquet-tools"
+
+# Determine the path to the script's directory
+APPPATH=$( cd "$(dirname "$0")" ; pwd -P )
+
+# Run the application
+exec "${APPPATH}/${TOPSCRIPT}" size "$@"