You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by do...@apache.org on 2021/12/16 00:00:22 UTC
[orc] branch branch-1.7 updated: ORC-1017: Add sizes tool to determine and display the sizes of each column in a set of files. (#925)
This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-1.7
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/branch-1.7 by this push:
new 0fbfa66 ORC-1017: Add sizes tool to determine and display the sizes of each column in a set of files. (#925)
0fbfa66 is described below
commit 0fbfa6603b1dc61c7a8b4553ddf7e0158e42fdd5
Author: Owen O'Malley <oo...@linkedin.com>
AuthorDate: Mon Oct 4 23:17:20 2021 -0700
ORC-1017: Add sizes tool to determine and display the sizes of each column in a set of files. (#925)
### What changes were proposed in this pull request?
This patch adds a new tool that accounts for the total size of a set of ORC files. For files written by >= ORC 1.5, you'll get a column breakdown of the file. There are some virtual columns that are included:
- _index: the indexes that are used for skipping inside the stripe
- _data: the data in files written prior to ORC 1.5
- _stripe_footer: the stripe metadata
- _file_footer: the file metadata
- _padding: padding added to align stripes to HDFS block boundaries
I also added a new method on TypeDescription that gets the full field name, which is the inverse of findSubtype.
### Why are the changes needed?
The tool helps diagnose the compression of a set of files.
### How was this patch tested?
I added a test of the new TypeDescription.getFullFieldName. I ran the tool over some of the examples and some multiple-terabyte directories of production ORC files.
(cherry picked from commit be0762b67b04b9b4592b8a6364addd447c704fc2)
Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
.../src/java/org/apache/orc/TypeDescription.java | 67 +++++++
.../test/org/apache/orc/TestTypeDescription.java | 19 ++
java/tools/pom.xml | 64 +++++--
java/tools/src/findbugs/exclude.xml | 12 +-
.../src/java/org/apache/orc/tools/ColumnSizes.java | 196 +++++++++++++++++++++
.../src/java/org/apache/orc/tools/Driver.java | 4 +
6 files changed, 336 insertions(+), 26 deletions(-)
diff --git a/java/core/src/java/org/apache/orc/TypeDescription.java b/java/core/src/java/org/apache/orc/TypeDescription.java
index 6e3b05f..10520a8 100644
--- a/java/core/src/java/org/apache/orc/TypeDescription.java
+++ b/java/core/src/java/org/apache/orc/TypeDescription.java
@@ -862,4 +862,71 @@ public class TypeDescription
+ source);
}
}
+
+ /**
+  * Locate a child of this type by object identity (==), not equals().
+  * The children are scanned from the last one back to the first.
+  * @param child The child type to look for
+  * @return the position of the child, from 0 to N-1
+  * @throws IllegalArgumentException if no child is the same object
+  */
+ private int getChildIndex(TypeDescription child) {
+   int index = children.size();
+   while (--index >= 0) {
+     if (child == children.get(index)) {
+       return index;
+     }
+   }
+   throw new IllegalArgumentException("Child not found");
+ }
+
+ /**
+ * For a complex type, get the partial name for this child. For structures,
+ * it returns the corresponding field name. For lists and maps, it uses the
+ * special names "_elem", "_key", and "_value". Unions use the integer index.
+ * @param child The desired child, which must be the same object (==)
+ * @return The name of the field for the given child.
+ */
+ private String getPartialName(TypeDescription child) {
+ switch (category) {
+ case LIST:
+ return "_elem";
+ case MAP:
+ return getChildIndex(child) == 0 ? "_key" : "_value";
+ case STRUCT:
+ return fieldNames.get(getChildIndex(child));
+ case UNION:
+ return Integer.toString(getChildIndex(child));
+ default:
+ throw new IllegalArgumentException(
+ "Can't get the field name of a primitive type");
+ }
+ }
+
+ /**
+  * Get the full field name for this type as a dotted path from the root.
+  * For "struct<a:array<struct<b:int,c:int>>>" when called on c, it
+  * returns "a._elem.c". The root type has no name of its own, so it is
+  * reported as its column id instead of an empty string.
+  * @return A string that is the inverse of findSubtype
+  */
+ public String getFullFieldName() {
+   TypeDescription current = this;
+   TypeDescription parent = current.getParent();
+   // Handle the root as a special case so that it isn't an empty string.
+   if (parent == null) {
+     return Integer.toString(current.getId());
+   }
+   // Collect the path components while walking up toward the root. The
+   // nesting depth is unrelated to the column id, so the list is left to
+   // size itself instead of being pre-sized with getId().
+   List<String> parts = new ArrayList<>();
+   while (parent != null) {
+     parts.add(parent.getPartialName(current));
+     current = parent;
+     parent = current.getParent();
+   }
+   // The parts were gathered leaf-first, so put the string together backwards
+   StringBuilder buffer = new StringBuilder();
+   for (int part = parts.size() - 1; part >= 0; --part) {
+     buffer.append(parts.get(part));
+     if (part != 0) {
+       buffer.append('.');
+     }
+   }
+   return buffer.toString();
+ }
}
diff --git a/java/core/src/test/org/apache/orc/TestTypeDescription.java b/java/core/src/test/org/apache/orc/TestTypeDescription.java
index 80e3b77..9203eaf 100644
--- a/java/core/src/test/org/apache/orc/TestTypeDescription.java
+++ b/java/core/src/test/org/apache/orc/TestTypeDescription.java
@@ -490,4 +490,23 @@ public class TestTypeDescription {
assertThrows(IllegalArgumentException.class, () ->
schema.annotateEncryption(null,"nullify:name;sha256:name"));
}
+
+ @Test
+ public void testGetFullFieldName() {
+ TypeDescription schema = TypeDescription.fromString(
+ "struct<" +
+ "name:struct<first:string,last:string>," +
+ "address:struct<street:string,city:string,country:string,post_code:string>," +
+ "credit_cards:array<struct<card_number:string,expire:date,ccv:string>>," +
+ "properties:map<string,uniontype<int,string>>>");
+ for (String column: new String[]{"0", "name", "name.first", "name.last",
+ "address.street", "address.city",
+ "credit_cards", "credit_cards._elem",
+ "credit_cards._elem.card_number",
+ "properties", "properties._key", "properties._value",
+ "properties._value.0", "properties._value.1"}) {
+ assertEquals(column,
+ schema.findSubtype(column, true).getFullFieldName());
+ }
+ }
}
diff --git a/java/tools/pom.xml b/java/tools/pom.xml
index a8447c1..b534373 100644
--- a/java/tools/pom.xml
+++ b/java/tools/pom.xml
@@ -115,25 +115,59 @@
<artifactId>maven-compiler-plugin</artifactId>
</plugin>
<plugin>
- <artifactId>maven-assembly-plugin</artifactId>
- <version>${maven-assembly-plugin.version}</version>
- <configuration>
- <archive>
- <manifest>
- <mainClass>org.apache.orc.tools.Driver</mainClass>
- </manifest>
- </archive>
- <descriptors>
- <descriptor>src/assembly/uber.xml</descriptor>
- </descriptors>
- </configuration>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-shade-plugin</artifactId>
+ <version>${maven-shade-plugin.version}</version>
<executions>
<execution>
- <id>make-assembly</id> <!-- this is used for inheritance merges -->
- <phase>package</phase> <!-- bind to the packaging phase -->
+ <phase>package</phase>
<goals>
- <goal>single</goal>
+ <goal>shade</goal>
</goals>
+ <configuration>
+ <artifactSet>
+ <includes>
+ <include>*:*</include>
+ </includes>
+ </artifactSet>
+ <transformers>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+ <mainClass>org.apache.orc.tools.Driver</mainClass>
+ </transformer>
+ </transformers>
+ <shadedArtifactAttached>true</shadedArtifactAttached>
+ <shadedClassifierName>uber</shadedClassifierName>
+ <relocations>
+ <relocation>
+ <pattern>com.google.protobuf</pattern>
+ <shadedPattern>com.google.protobuf25</shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hadoop.hive</pattern>
+ <shadedPattern>org.apache.orc.storage</shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.hive</pattern>
+ <shadedPattern>org.apache.orc.storage</shadedPattern>
+ </relocation>
+ <relocation>
+ <pattern>org.apache.commons</pattern>
+ <shadedPattern>org.apache.orc.shade.commons</shadedPattern>
+ </relocation>
+ </relocations>
+ <filters>
+ <filter>
+ <artifact>*:*</artifact>
+ <excludes>
+ <exclude>module-info.class</exclude>
+ <exclude>META-INF/MANIFEST.MF</exclude>
+ <exclude>META-INF/DEPENDENCIES</exclude>
+ <exclude>META-INF/LICENSE</exclude>
+ <exclude>META-INF/NOTICE</exclude>
+ </excludes>
+ </filter>
+ </filters>
+ </configuration>
</execution>
</executions>
</plugin>
diff --git a/java/tools/src/findbugs/exclude.xml b/java/tools/src/findbugs/exclude.xml
index 813a8e0..555068b 100644
--- a/java/tools/src/findbugs/exclude.xml
+++ b/java/tools/src/findbugs/exclude.xml
@@ -19,17 +19,7 @@
<!-- Java's try with resources causes a false positive.
See https://github.com/SERG-Delft/jpacman/pull/27 . -->
<Match>
- <Bug pattern="RCN_REDUNDANT_NULLCHECK_OF_NONNULL_VALUE"/>
- <Class name="~org\.apache\.orc\.tools\.(ScanData|PrintVersion)"/>
- <Method name="main"/>
- </Match>
- <Match>
- <Bug pattern="RCN_REDUNDANT_NULLCHECK_WOULD_HAVE_BEEN_A_NPE"/>
- <Class name="~org\.apache\.orc.*\.Test.*"/>
- </Match>
- <Match>
- <Bug pattern="RCN_REDUNDANT_NULLCHECK_WOULD_HAVE_BEEN_A_NPE"/>
- <Class name="~org\.apache\.orc.tools.(RowCount|ScanData)"/>
+ <Bug pattern="RCN_REDUNDANT_NULLCHECK_OF_NONNULL_VALUE,RCN_REDUNDANT_NULLCHECK_WOULD_HAVE_BEEN_A_NPE"/>
</Match>
<Match>
<Bug pattern="REC_CATCH_EXCEPTION"/>
diff --git a/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java b/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java
new file mode 100644
index 0000000..49dbc2b
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java
@@ -0,0 +1,196 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.orc.ColumnStatistics;
+import org.apache.orc.OrcFile;
+import org.apache.orc.Reader;
+import org.apache.orc.StripeInformation;
+import org.apache.orc.TypeDescription;
+
+import java.io.IOException;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Given a set of paths, finds all of the "*.orc" files under them and
+ * prints the sizes of each column, both as a percentage and the number of
+ * bytes per a row.
+ */
+public class ColumnSizes {
+ final Configuration conf;
+ final TypeDescription schema;
+ final long[] columnSizes;
+ int goodFiles = 0;
+ long rows = 0;
+ long padding = 0;
+ long totalSize = 0;
+ long stripeFooterSize = 0;
+ long fileFooterSize = 0;
+ long stripeIndex = 0;
+ // data bytes that aren't assigned to a specific column
+ long stripeData = 0;
+
+ public ColumnSizes(Configuration conf,
+ LocatedFileStatus file) throws IOException {
+ this.conf = conf;
+ try (Reader reader = OrcFile.createReader(file.getPath(),
+ OrcFile.readerOptions(conf))) {
+ this.schema = reader.getSchema();
+ columnSizes = new long[schema.getMaximumId() + 1];
+ addReader(file, reader);
+ }
+ }
+
+ private void checkStripes(LocatedFileStatus file,
+ Reader reader) {
+ // Count the magic as file overhead
+ long offset = OrcFile.MAGIC.length();
+ fileFooterSize += offset;
+
+ for (StripeInformation stripe: reader.getStripes()) {
+ padding += stripe.getOffset() - offset;
+ stripeIndex += stripe.getIndexLength();
+ stripeData += stripe.getDataLength();
+ stripeFooterSize += stripe.getFooterLength();
+ offset = stripe.getOffset() + stripe.getLength();
+ }
+ // Add everything else as the file footer
+ fileFooterSize += file.getLen() - offset;
+ }
+
+ private boolean addReader(LocatedFileStatus file,
+ Reader reader) {
+ // Validate that the schemas are the same
+ TypeDescription newSchema = reader.getSchema();
+ if (schema.equals(newSchema)) {
+ goodFiles += 1;
+ rows += reader.getNumberOfRows();
+ totalSize += file.getLen();
+ checkStripes(file, reader);
+ ColumnStatistics[] colStats = reader.getStatistics();
+ for (int c = 0; c < colStats.length && c < columnSizes.length; c++) {
+ columnSizes[c] += colStats[c].getBytesOnDisk();
+ // Don't double count. Either count the bytes as stripe data or as
+ // part of a column.
+ stripeData -= colStats[c].getBytesOnDisk();
+ }
+ } else {
+ System.err.println("Ignoring " + file.getPath()
+ + " because of schema mismatch: " + newSchema);
+ return false;
+ }
+ return true;
+ }
+
+ public boolean addFile(LocatedFileStatus file) throws IOException {
+ try (Reader reader = OrcFile.createReader(file.getPath(),
+ OrcFile.readerOptions(conf))) {
+ return addReader(file, reader);
+ }
+ }
+
+ private static class StringLongPair {
+ final String name;
+ final long size;
+ StringLongPair(String name, long size) {
+ this.name = name;
+ this.size = size;
+ }
+ }
+
+ private void printResults(PrintStream out) {
+ List<StringLongPair> sizes = new ArrayList<>(columnSizes.length + 5);
+ for(int column = 0; column < columnSizes.length; ++column) {
+ if (columnSizes[column] > 0) {
+ sizes.add(new StringLongPair(
+ schema.findSubtype(column).getFullFieldName(),
+ columnSizes[column]));
+ }
+ }
+ if (padding > 0) {
+ sizes.add(new StringLongPair("_padding", padding));
+ }
+ if (stripeFooterSize > 0) {
+ sizes.add(new StringLongPair("_stripe_footer", stripeFooterSize));
+ }
+ if (fileFooterSize > 0) {
+ sizes.add(new StringLongPair("_file_footer", fileFooterSize));
+ }
+ if (stripeIndex > 0) {
+ sizes.add(new StringLongPair("_index", stripeIndex));
+ }
+ if (stripeData > 0) {
+ sizes.add(new StringLongPair("_data", stripeData));
+ }
+ // sort by descending size, ascending name
+ sizes.sort((x, y) -> x.size != y.size ?
+ Long.compare(y.size, x.size) : x.name.compareTo(y.name));
+ out.println("Percent Bytes/Row Name");
+ for (StringLongPair item: sizes) {
+ out.println(String.format(" %-5.2f %-9.2f %s",
+ 100.0 * item.size / totalSize, (double) item.size / rows, item.name));
+ }
+ }
+
+ public static void main(Configuration conf, String[] args) throws IOException {
+ ColumnSizes result = null;
+ int badFiles = 0;
+ for(String root: args) {
+ Path rootPath = new Path(root);
+ FileSystem fs = rootPath.getFileSystem(conf);
+ for(RemoteIterator<LocatedFileStatus> itr = fs.listFiles(rootPath, true); itr.hasNext(); ) {
+ LocatedFileStatus status = itr.next();
+ if (status.isFile() && status.getPath().getName().endsWith(".orc")) {
+ try {
+ if (result == null) {
+ result = new ColumnSizes(conf, status);
+ } else {
+ if (!result.addFile(status)) {
+ badFiles += 1;
+ }
+ }
+ } catch (IOException err) {
+ badFiles += 1;
+ System.err.println("Failed to read " + status.getPath());
+ }
+ }
+ }
+ }
+ if (result == null) {
+ System.err.println("No files found");
+ } else {
+ result.printResults(System.out);
+ }
+ if (badFiles > 0) {
+ System.err.println(badFiles + " bad ORC files found.");
+ System.exit(1);
+ }
+ }
+
+ public static void main(String[] args) throws IOException {
+ main(new Configuration(), args);
+ }
+}
diff --git a/java/tools/src/java/org/apache/orc/tools/Driver.java b/java/tools/src/java/org/apache/orc/tools/Driver.java
index 4be6d8c..eead10c 100644
--- a/java/tools/src/java/org/apache/orc/tools/Driver.java
+++ b/java/tools/src/java/org/apache/orc/tools/Driver.java
@@ -93,6 +93,7 @@ public class Driver {
System.err.println(" key - print information about the keys");
System.err.println(" meta - print the metadata about the ORC file");
System.err.println(" scan - scan the ORC file");
+ System.err.println(" sizes - list size on disk of each column");
System.err.println(" version - print the version of this ORC tool");
System.err.println();
System.err.println("To get more help, provide -h to the command");
@@ -125,6 +126,9 @@ public class Driver {
case "scan":
ScanData.main(conf, options.commandArgs);
break;
+ case "sizes":
+ ColumnSizes.main(conf, options.commandArgs);
+ break;
case "version":
PrintVersion.main(conf, options.commandArgs);
break;