You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by ju...@apache.org on 2017/05/12 22:02:30 UTC
parquet-mr git commit: PARQUET-196: parquet-tools command for row
count & size
Repository: parquet-mr
Updated Branches:
refs/heads/master a703ee75c -> fd7cfed07
PARQUET-196: parquet-tools command for row count & size
This is a rebase of the already existing PR:
https://github.com/apache/parquet-mr/pull/132
Author: Swapnil Shinde <sw...@gmail.com>
Closes #406 from swapnilushinde/master and squashes the following commits:
59a8980 [Swapnil Shinde] Spacing to conform java style (if/for) is fixed
5fd0279 [Swapnil Shinde] Parquet-196: parquet-tools command for row count & size
Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/fd7cfed0
Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/fd7cfed0
Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/fd7cfed0
Branch: refs/heads/master
Commit: fd7cfed070c2aab60521afb7dcc633a0b7abea80
Parents: a703ee7
Author: Swapnil Shinde <sw...@gmail.com>
Authored: Fri May 12 15:02:27 2017 -0700
Committer: Julien Le Dem <ju...@apache.org>
Committed: Fri May 12 15:02:27 2017 -0700
----------------------------------------------------------------------
.../apache/parquet/tools/command/Registry.java | 2 +
.../parquet/tools/command/RowCountCommand.java | 97 +++++++++++++
.../parquet/tools/command/SizeCommand.java | 140 +++++++++++++++++++
parquet-tools/src/main/scripts/parquet-rowcount | 28 ++++
parquet-tools/src/main/scripts/parquet-size | 28 ++++
5 files changed, 295 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/fd7cfed0/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java
----------------------------------------------------------------------
diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java
index 0e69f48..6df84be 100644
--- a/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java
+++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java
@@ -32,6 +32,8 @@ public final class Registry {
registry.put("meta", ShowMetaCommand.class);
registry.put("dump", DumpCommand.class);
registry.put("merge", MergeCommand.class);
+ registry.put("rowcount", RowCountCommand.class);
+ registry.put("size", SizeCommand.class);
}
public static Map<String,Command> allCommands() {
http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/fd7cfed0/parquet-tools/src/main/java/org/apache/parquet/tools/command/RowCountCommand.java
----------------------------------------------------------------------
diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/RowCountCommand.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/RowCountCommand.java
new file mode 100644
index 0000000..37d6079
--- /dev/null
+++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/RowCountCommand.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.tools.command;
+
+import java.io.PrintWriter;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+
+import org.apache.parquet.hadoop.Footer;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.metadata.BlockMetaData;
+import org.apache.parquet.tools.Main;
+
+/** Prints the number of rows in the parquet file(s) matching an input path or glob. */
+public class RowCountCommand extends ArgsOnlyCommand {
+  public static final String[] USAGE = new String[] {
+    "<input>",
+    "where <input> is the parquet file to count rows to stdout"
+  };
+
+  public static final Options OPTIONS;
+  static {
+    OPTIONS = new Options();
+    Option detailed = OptionBuilder.withLongOpt("detailed")
+        .withDescription("Detailed rowcount of each matching file")
+        .create('d');
+    OPTIONS.addOption(detailed);
+  }
+
+  public RowCountCommand() {
+    super(1, 1);
+  }
+
+  @Override
+  public Options getOptions() {
+    return OPTIONS;
+  }
+
+  @Override
+  public String[] getUsageDescription() {
+    return USAGE;
+  }
+
+  /**
+   * Prints the total row count of all row groups in every file matching the input path;
+   * with {@code -d} each file's own count is printed first. Fails on a missing input path.
+   */
+  @Override
+  public void execute(CommandLine options) throws Exception {
+    super.execute(options);
+    Path inputPath = new Path(options.getArgs()[0]);
+    Configuration conf = new Configuration();
+    FileStatus[] inputFileStatuses = inputPath.getFileSystem(conf).globStatus(inputPath);
+    if (inputFileStatuses == null) {
+      // globStatus returns null, not an empty array, for a missing non-glob path
+      throw new IllegalArgumentException("Input path does not exist: " + inputPath);
+    }
+    PrintWriter out = new PrintWriter(Main.out, true);
+    boolean detailed = options.hasOption('d');
+    long rowCount = 0;
+    for (FileStatus fs : inputFileStatuses) {
+      long fileRowCount = 0;
+      for (Footer f : ParquetFileReader.readFooters(conf, fs, false)) {
+        for (BlockMetaData b : f.getParquetMetadata().getBlocks()) {
+          fileRowCount += b.getRowCount();
+        }
+      }
+      rowCount += fileRowCount;
+      if (detailed) {
+        out.format("%s row count: %d%n", fs.getPath().getName(), fileRowCount);
+      }
+    }
+    out.format("Total RowCount: %d%n", rowCount);
+  }
+}
http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/fd7cfed0/parquet-tools/src/main/java/org/apache/parquet/tools/command/SizeCommand.java
----------------------------------------------------------------------
diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/SizeCommand.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/SizeCommand.java
new file mode 100644
index 0000000..bcc6704
--- /dev/null
+++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/SizeCommand.java
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.parquet.tools.command;
+
+import java.io.PrintWriter;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+
+import org.apache.parquet.hadoop.Footer;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.metadata.BlockMetaData;
+import org.apache.parquet.tools.Main;
+
+public class SizeCommand extends ArgsOnlyCommand {
+ private FileStatus[] inputFileStatuses;
+ private Configuration conf;
+ private Path inputPath;
+ private PrintWriter out;
+ private static final double ONE_KB = 1024;
+ private static final double ONE_MB = ONE_KB * 1024;
+ private static final double ONE_GB = ONE_MB * 1024;
+ private static final double ONE_TB = ONE_GB * 1024;
+ private static final double ONE_PB = ONE_TB * 1024;
+
+ public static final String[] USAGE = new String[] {
+ "<input>",
+ "where <input> is the parquet file to get size & human readable size to stdout"
+ };
+
+ public static final Options OPTIONS;
+ static {
+ OPTIONS = new Options();
+ Option help = OptionBuilder.withLongOpt("pretty")
+ .withDescription("Pretty size")
+ .create('p');
+ OPTIONS.addOption(help);
+ Option uncompressed = OptionBuilder.withLongOpt("uncompressed")
+ .withDescription("Uncompressed size")
+ .create('u');
+ OPTIONS.addOption(uncompressed);
+ Option detailed = OptionBuilder.withLongOpt("detailed")
+ .withDescription("Detailed size of each matching file")
+ .create('d');
+ OPTIONS.addOption(detailed);
+ }
+
+ public SizeCommand() {
+ super(1, 1);
+ }
+
+ @Override
+ public Options getOptions() {
+ return OPTIONS;
+ }
+
+ @Override
+ public String[] getUsageDescription() {
+ return USAGE;
+ }
+
+ @Override
+ public void execute(CommandLine options) throws Exception {
+ super.execute(options);
+
+ String[] args = options.getArgs();
+ String input = args[0];
+ out = new PrintWriter(Main.out, true);
+ inputPath = new Path(input);
+ conf = new Configuration();
+ inputFileStatuses = inputPath.getFileSystem(conf).globStatus(inputPath);
+ long size = 0;
+ for (FileStatus fs : inputFileStatuses) {
+ long fileSize = 0;
+ for (Footer f : ParquetFileReader.readFooters(conf, fs, false)) {
+ for (BlockMetaData b : f.getParquetMetadata().getBlocks()) {
+ size += (options.hasOption('u') ? b.getTotalByteSize() : b.getCompressedSize());
+ fileSize += (options.hasOption('u') ? b.getTotalByteSize() : b.getCompressedSize());
+ }
+ }
+ if (options.hasOption('d')) {
+ if (options.hasOption('p')) {
+ out.format("%s: %s\n", fs.getPath().getName(), getPrettySize(fileSize));
+ }
+ else {
+ out.format("%s: %d bytes\n", fs.getPath().getName(), fileSize);
+ }
+ }
+ }
+
+ if (options.hasOption('p')) {
+ out.format("Total Size: %s", getPrettySize(size));
+ }
+ else {
+ out.format("Total Size: %d bytes", size);
+ }
+ out.println();
+ }
+
+ public String getPrettySize(long bytes){
+ if (bytes/ONE_KB < 1) {
+ return String.format("%d", bytes) + " bytes";
+ }
+ if (bytes/ONE_MB < 1) {
+ return String.format("%.3f", bytes/ONE_KB) + " KB";
+ }
+ if (bytes/ONE_GB < 1) {
+ return String.format("%.3f", bytes/ONE_MB) + " MB";
+ }
+ if (bytes/ONE_TB < 1) {
+ return String.format("%.3f", bytes/ONE_GB) + " GB";
+ }
+ if (bytes/ONE_PB < 1) {
+ return String.format("%.3f", bytes/ONE_TB) + " TB";
+ }
+ return String.format("%.3f", bytes/ONE_PB) + " PB";
+ }
+}
http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/fd7cfed0/parquet-tools/src/main/scripts/parquet-rowcount
----------------------------------------------------------------------
diff --git a/parquet-tools/src/main/scripts/parquet-rowcount b/parquet-tools/src/main/scripts/parquet-rowcount
new file mode 100644
index 0000000..ab12e71
--- /dev/null
+++ b/parquet-tools/src/main/scripts/parquet-rowcount
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# Name of the top-level launcher script this wrapper delegates to
+TOPSCRIPT="parquet-tools"
+
+# Physical directory containing this script; abort if it cannot be entered rather than fall back to $PWD
+APPPATH=$( cd "$(dirname "$0")" && pwd -P ) || exit 1
+
+# Replace this shell with the launcher, running its "rowcount" command with all given arguments
+exec "${APPPATH}/${TOPSCRIPT}" rowcount "$@"
http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/fd7cfed0/parquet-tools/src/main/scripts/parquet-size
----------------------------------------------------------------------
diff --git a/parquet-tools/src/main/scripts/parquet-size b/parquet-tools/src/main/scripts/parquet-size
new file mode 100644
index 0000000..c9048b0
--- /dev/null
+++ b/parquet-tools/src/main/scripts/parquet-size
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# Name of the top-level launcher script this wrapper delegates to
+TOPSCRIPT="parquet-tools"
+
+# Physical directory containing this script; abort if it cannot be entered rather than fall back to $PWD
+APPPATH=$( cd "$(dirname "$0")" && pwd -P ) || exit 1
+
+# Replace this shell with the launcher, running its "size" command with all given arguments
+exec "${APPPATH}/${TOPSCRIPT}" size "$@"