You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by do...@apache.org on 2021/12/16 00:00:22 UTC
[orc] branch branch-1.7 updated: ORC-1017: Add sizes tool to determine and display the sizes of each column in a set of files. (#925)

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-1.7
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/branch-1.7 by this push:
     new 0fbfa66  ORC-1017: Add sizes tool to determine and display the sizes of each column in a set of files. (#925)
0fbfa66 is described below

commit 0fbfa6603b1dc61c7a8b4553ddf7e0158e42fdd5
Author: Owen O'Malley <oo...@linkedin.com>
AuthorDate: Mon Oct 4 23:17:20 2021 -0700

    ORC-1017: Add sizes tool to determine and display the sizes of each column in a set of files. (#925)
    
    ### What changes were proposed in this pull request?
    
    This patch adds a new tool that accounts for the total size of a set of ORC files. For files written by >= ORC 1.5, you'll get a column breakdown of the file. There are some virtual columns that are included:
    - _index: the indexes that are used for skipping inside the stripe
    - _data: the data in files written prior to ORC 1.5
    - _stripe_footer: the stripe metadata
    - _file_footer: the file metadata
    - _padding: padding added to align stripes to HDFS block boundaries
    
    I also added a new method on TypeDescription that gets the full field name, which is the inverse of findSubtype.
    
    ### Why are the changes needed?
    
    The tool helps diagnose the compression of a set of files.
    
    ### How was this patch tested?
    
    I added a test of the new TypeDescription.getFullFieldName. I ran the tool over some of the examples and some multiple-terabyte directories of production ORC files.
    
    (cherry picked from commit be0762b67b04b9b4592b8a6364addd447c704fc2)
    Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
 .../src/java/org/apache/orc/TypeDescription.java   |  67 +++++++
 .../test/org/apache/orc/TestTypeDescription.java   |  19 ++
 java/tools/pom.xml                                 |  64 +++++--
 java/tools/src/findbugs/exclude.xml                |  12 +-
 .../src/java/org/apache/orc/tools/ColumnSizes.java | 196 +++++++++++++++++++++
 .../src/java/org/apache/orc/tools/Driver.java      |   4 +
 6 files changed, 336 insertions(+), 26 deletions(-)

diff --git a/java/core/src/java/org/apache/orc/TypeDescription.java b/java/core/src/java/org/apache/orc/TypeDescription.java
index 6e3b05f..10520a8 100644
--- a/java/core/src/java/org/apache/orc/TypeDescription.java
+++ b/java/core/src/java/org/apache/orc/TypeDescription.java
@@ -862,4 +862,71 @@ public class TypeDescription
           + source);
     }
   }
+
+  /**
+   * Find the index of a given child object using == (identity) comparison.
+   * @param child The child type; IllegalArgumentException if it is not found
+   * @return the index 0 to N-1 of the child within children.
+   */
+  private int getChildIndex(TypeDescription child) {
+    for(int i=children.size() - 1; i >= 0; --i) { // scan from the last child down
+      if (children.get(i) == child) {
+        return i;
+      }
+    }
+    throw new IllegalArgumentException("Child not found");
+  }
+
+  /**
+   * For a complex type, get the partial name for this child. For structures,
+   * it returns the corresponding field name. For lists and maps, it uses the
+   * special names "_elem", "_key", and "_value". Unions use the integer index.
+   * @param child The desired child, which must be the same object (==)
+   * @return The name of the field for the given child.
+   */
+  private String getPartialName(TypeDescription child) {
+    switch (category) {
+      case LIST:
+        return "_elem"; // the child is not looked up for lists
+      case MAP:
+        return getChildIndex(child) == 0 ? "_key" : "_value"; // child 0 is the key
+      case STRUCT:
+        return fieldNames.get(getChildIndex(child)); // fieldNames is indexed like children
+      case UNION:
+        return Integer.toString(getChildIndex(child));
+      default:
+        throw new IllegalArgumentException(
+            "Can't get the field name of a primitive type");
+    }
+  }
+
+  /**
+   * Get the full field name for this type. For
+   * "struct&lt;a:array&lt;struct&lt;b:int,c:int&gt;&gt;&gt;", calling this
+   * on c returns "a._elem.c". The root type returns its id as a string.
+   * @return A string that is the inverse of findSubtype
+   */
+  public String getFullFieldName() {
+    List<String> parts = new ArrayList<>(getId()); // getId() only sizes the list
+    TypeDescription current = this;
+    TypeDescription parent = current.getParent();
+    // Handle the root as a special case so that it isn't an empty string.
+    if (parent == null) {
+      return Integer.toString(current.getId());
+    }
+    while (parent != null) {
+      parts.add(parent.getPartialName(current)); // names are collected leaf-first
+      current = parent;
+      parent = current.getParent();
+    }
+    // Put the string together backwards
+    StringBuilder buffer = new StringBuilder();
+    for (int part=parts.size() - 1; part >= 0; --part) {
+      buffer.append(parts.get(part));
+      if (part != 0) {
+        buffer.append('.');
+      }
+    }
+    return buffer.toString();
+  }
 }
diff --git a/java/core/src/test/org/apache/orc/TestTypeDescription.java b/java/core/src/test/org/apache/orc/TestTypeDescription.java
index 80e3b77..9203eaf 100644
--- a/java/core/src/test/org/apache/orc/TestTypeDescription.java
+++ b/java/core/src/test/org/apache/orc/TestTypeDescription.java
@@ -490,4 +490,23 @@ public class TestTypeDescription {
     assertThrows(IllegalArgumentException.class, () ->
         schema.annotateEncryption(null,"nullify:name;sha256:name"));
   }
+
+  @Test
+  public void testGetFullFieldName() {
+    TypeDescription schema = TypeDescription.fromString(
+        "struct<" +
+            "name:struct<first:string,last:string>," +
+            "address:struct<street:string,city:string,country:string,post_code:string>," +
+            "credit_cards:array<struct<card_number:string,expire:date,ccv:string>>," +
+            "properties:map<string,uniontype<int,string>>>");
+    for (String column: new String[]{"0", "name", "name.first", "name.last",
+                                     "address.street", "address.city",
+                                     "credit_cards", "credit_cards._elem",
+                                     "credit_cards._elem.card_number",
+                                     "properties", "properties._key", "properties._value",
+                                     "properties._value.0", "properties._value.1"}) {
+      assertEquals(column, // each name must round-trip through findSubtype
+          schema.findSubtype(column, true).getFullFieldName());
+    }
+  }
 }
diff --git a/java/tools/pom.xml b/java/tools/pom.xml
index a8447c1..b534373 100644
--- a/java/tools/pom.xml
+++ b/java/tools/pom.xml
@@ -115,25 +115,59 @@
         <artifactId>maven-compiler-plugin</artifactId>
       </plugin>
       <plugin>
-        <artifactId>maven-assembly-plugin</artifactId>
-        <version>${maven-assembly-plugin.version}</version>
-        <configuration>
-          <archive>
-            <manifest>
-              <mainClass>org.apache.orc.tools.Driver</mainClass>
-            </manifest>
-          </archive>
-          <descriptors>
-            <descriptor>src/assembly/uber.xml</descriptor>
-          </descriptors>
-        </configuration>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-shade-plugin</artifactId>
+        <version>${maven-shade-plugin.version}</version>
         <executions>
           <execution>
-            <id>make-assembly</id> <!-- this is used for inheritance merges -->
-            <phase>package</phase> <!-- bind to the packaging phase -->
+            <phase>package</phase>
             <goals>
-              <goal>single</goal>
+              <goal>shade</goal>
             </goals>
+            <configuration>
+              <artifactSet>
+                <includes>
+                  <include>*:*</include>
+                </includes>
+              </artifactSet>
+              <transformers>
+                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+                  <mainClass>org.apache.orc.tools.Driver</mainClass>
+                </transformer>
+              </transformers>
+              <shadedArtifactAttached>true</shadedArtifactAttached>
+              <shadedClassifierName>uber</shadedClassifierName>
+              <relocations>
+                <relocation>
+                  <pattern>com.google.protobuf</pattern>
+                  <shadedPattern>com.google.protobuf25</shadedPattern>
+                </relocation>
+                <relocation>
+                  <pattern>org.apache.hadoop.hive</pattern>
+                  <shadedPattern>org.apache.orc.storage</shadedPattern>
+                </relocation>
+                <relocation>
+                  <pattern>org.apache.hive</pattern>
+                  <shadedPattern>org.apache.orc.storage</shadedPattern>
+                </relocation>
+                <relocation>
+                  <pattern>org.apache.commons</pattern>
+                  <shadedPattern>org.apache.orc.shade.commons</shadedPattern>
+                </relocation>
+              </relocations>
+              <filters>
+                <filter>
+                  <artifact>*:*</artifact>
+                  <excludes>
+                    <exclude>module-info.class</exclude>
+                    <exclude>META-INF/MANIFEST.MF</exclude>
+                    <exclude>META-INF/DEPENDENCIES</exclude>
+                    <exclude>META-INF/LICENSE</exclude>
+                    <exclude>META-INF/NOTICE</exclude>
+                  </excludes>
+                </filter>
+              </filters>
+            </configuration>
           </execution>
         </executions>
       </plugin>
diff --git a/java/tools/src/findbugs/exclude.xml b/java/tools/src/findbugs/exclude.xml
index 813a8e0..555068b 100644
--- a/java/tools/src/findbugs/exclude.xml
+++ b/java/tools/src/findbugs/exclude.xml
@@ -19,17 +19,7 @@
   <!-- Java's try with resources causes a false positive.
        See https://github.com/SERG-Delft/jpacman/pull/27 . -->
   <Match>
-    <Bug pattern="RCN_REDUNDANT_NULLCHECK_OF_NONNULL_VALUE"/>
-    <Class name="~org\.apache\.orc\.tools\.(ScanData|PrintVersion)"/>
-    <Method name="main"/>
-  </Match>
-  <Match>
-    <Bug pattern="RCN_REDUNDANT_NULLCHECK_WOULD_HAVE_BEEN_A_NPE"/>
-    <Class name="~org\.apache\.orc.*\.Test.*"/>
-  </Match>
-  <Match>
-    <Bug pattern="RCN_REDUNDANT_NULLCHECK_WOULD_HAVE_BEEN_A_NPE"/>
-    <Class name="~org\.apache\.orc.tools.(RowCount|ScanData)"/>
+    <Bug pattern="RCN_REDUNDANT_NULLCHECK_OF_NONNULL_VALUE,RCN_REDUNDANT_NULLCHECK_WOULD_HAVE_BEEN_A_NPE"/>
   </Match>
   <Match>
     <Bug pattern="REC_CATCH_EXCEPTION"/>
diff --git a/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java b/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java
new file mode 100644
index 0000000..49dbc2b
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java
@@ -0,0 +1,196 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.orc.ColumnStatistics;
+import org.apache.orc.OrcFile;
+import org.apache.orc.Reader;
+import org.apache.orc.StripeInformation;
+import org.apache.orc.TypeDescription;
+
+import java.io.IOException;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Given a set of paths, finds all of the "*.orc" files under them and
+ * prints the sizes of each column, both as a percentage and the number of
+ * bytes per row.
+ */
+public class ColumnSizes {
+  final Configuration conf;
+  final TypeDescription schema; // schema of the first file; later files must equal it
+  final long[] columnSizes; // bytes on disk per column id, summed over accepted files
+  int goodFiles = 0; // number of files whose schema matched
+  long rows = 0;
+  long padding = 0; // gap bytes found in front of each stripe
+  long totalSize = 0; // sum of the lengths of the accepted files
+  long stripeFooterSize = 0;
+  long fileFooterSize = 0; // the magic plus everything after the last stripe
+  long stripeIndex = 0;
+  // data bytes that aren't assigned to a specific column
+  long stripeData = 0;
+
+  public ColumnSizes(Configuration conf,
+                     LocatedFileStatus file) throws IOException {
+    this.conf = conf;
+    try (Reader reader = OrcFile.createReader(file.getPath(),
+                                              OrcFile.readerOptions(conf))) {
+      this.schema = reader.getSchema(); // the reference schema for every later file
+      columnSizes = new long[schema.getMaximumId() + 1];
+      addReader(file, reader); // the first file is counted like any other
+    }
+  }
+
+  private void checkStripes(LocatedFileStatus file,
+                            Reader reader) {
+    // Count the magic as file overhead
+    long offset = OrcFile.MAGIC.length();
+    fileFooterSize += offset;
+
+    for (StripeInformation stripe: reader.getStripes()) {
+      padding += stripe.getOffset() - offset; // bytes between the previous end and this stripe
+      stripeIndex += stripe.getIndexLength();
+      stripeData += stripe.getDataLength();
+      stripeFooterSize += stripe.getFooterLength();
+      offset = stripe.getOffset() + stripe.getLength(); // presumably the first byte after it
+    }
+    // Add everything else as the file footer
+    fileFooterSize += file.getLen() - offset;
+  }
+
+  private boolean addReader(LocatedFileStatus file,
+                            Reader reader) {
+    // Validate that the schemas are the same
+    TypeDescription newSchema = reader.getSchema();
+    if (schema.equals(newSchema)) {
+      goodFiles += 1;
+      rows += reader.getNumberOfRows();
+      totalSize += file.getLen();
+      checkStripes(file, reader); // adds all stripe data bytes to stripeData first
+      ColumnStatistics[] colStats = reader.getStatistics();
+      for (int c = 0; c < colStats.length && c < columnSizes.length; c++) {
+        columnSizes[c] += colStats[c].getBytesOnDisk();
+        // Don't double count. Either count the bytes as stripe data or as
+        // part of a column.
+        stripeData -= colStats[c].getBytesOnDisk();
+      }
+    } else {
+      System.err.println("Ignoring " + file.getPath()
+          + " because of schema mismatch: " + newSchema);
+      return false; // nothing from a mismatched file is added to the totals
+    }
+    return true;
+  }
+
+  public boolean addFile(LocatedFileStatus file) throws IOException {
+    try (Reader reader = OrcFile.createReader(file.getPath(),
+        OrcFile.readerOptions(conf))) {
+      return addReader(file, reader); // false when the schema does not match
+    }
+  }
+
+  private static class StringLongPair {
+    final String name;
+    final long size;
+    StringLongPair(String name, long size) {
+      this.name = name;
+      this.size = size;
+    }
+  }
+
+  private void printResults(PrintStream out) {
+    List<StringLongPair> sizes = new ArrayList<>(columnSizes.length + 5); // + 5 virtual columns
+    for(int column = 0; column < columnSizes.length; ++column) {
+      if (columnSizes[column] > 0) { // columns with no recorded bytes are not listed
+        sizes.add(new StringLongPair(
+            schema.findSubtype(column).getFullFieldName(),
+            columnSizes[column]));
+      }
+    }
+    if (padding > 0) {
+      sizes.add(new StringLongPair("_padding", padding));
+    }
+    if (stripeFooterSize > 0) {
+      sizes.add(new StringLongPair("_stripe_footer", stripeFooterSize));
+    }
+    if (fileFooterSize > 0) {
+      sizes.add(new StringLongPair("_file_footer", fileFooterSize));
+    }
+    if (stripeIndex > 0) {
+      sizes.add(new StringLongPair("_index", stripeIndex));
+    }
+    if (stripeData > 0) {
+      sizes.add(new StringLongPair("_data", stripeData));
+    }
+    // sort by descending size, ascending name
+    sizes.sort((x, y) -> x.size != y.size ?
+        Long.compare(y.size, x.size) : x.name.compareTo(y.name));
+    out.println("Percent  Bytes/Row  Name");
+    for (StringLongPair item: sizes) {
+       out.println(String.format("  %-5.2f  %-9.2f  %s", // NOTE(review): 0 rows or size prints NaN/Infinity
+           100.0 * item.size / totalSize, (double) item.size / rows, item.name));
+    }
+  }
+
+  public static void main(Configuration conf, String[] args) throws IOException {
+    ColumnSizes result = null;
+    int badFiles = 0;
+    for(String root: args) {
+      Path rootPath = new Path(root);
+      FileSystem fs = rootPath.getFileSystem(conf);
+      for(RemoteIterator<LocatedFileStatus> itr = fs.listFiles(rootPath, true); itr.hasNext(); ) {
+        LocatedFileStatus status = itr.next();
+        if (status.isFile() && status.getPath().getName().endsWith(".orc")) {
+          try {
+            if (result == null) {
+              result = new ColumnSizes(conf, status); // first readable file fixes the schema
+            } else {
+              if (!result.addFile(status)) {
+                badFiles += 1; // schema mismatch
+              }
+            }
+          } catch (IOException err) {
+            badFiles += 1;
+            System.err.println("Failed to read " + status.getPath()); // NOTE(review): err detail is dropped
+          }
+        }
+      }
+    }
+    if (result == null) {
+      System.err.println("No files found");
+    } else {
+      result.printResults(System.out);
+    }
+    if (badFiles > 0) {
+      System.err.println(badFiles + " bad ORC files found.");
+      System.exit(1); // non-zero exit when any file was unreadable or mismatched
+    }
+  }
+
+  public static void main(String[] args) throws IOException {
+    main(new Configuration(), args); // standalone entry point with a default Configuration
+  }
+}
diff --git a/java/tools/src/java/org/apache/orc/tools/Driver.java b/java/tools/src/java/org/apache/orc/tools/Driver.java
index 4be6d8c..eead10c 100644
--- a/java/tools/src/java/org/apache/orc/tools/Driver.java
+++ b/java/tools/src/java/org/apache/orc/tools/Driver.java
@@ -93,6 +93,7 @@ public class Driver {
       System.err.println("   key - print information about the keys");
       System.err.println("   meta - print the metadata about the ORC file");
       System.err.println("   scan - scan the ORC file");
+      System.err.println("   sizes - list size on disk of each column");
       System.err.println("   version - print the version of this ORC tool");
       System.err.println();
       System.err.println("To get more help, provide -h to the command");
@@ -125,6 +126,9 @@ public class Driver {
       case "scan":
         ScanData.main(conf, options.commandArgs);
         break;
+      case "sizes":
+        ColumnSizes.main(conf, options.commandArgs);
+        break;
       case "version":
         PrintVersion.main(conf, options.commandArgs);
         break;