You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by ga...@apache.org on 2021/03/19 10:21:44 UTC

[parquet-mr] branch master updated: PARQUET-1978: Provide a tool to show the complete footer (#867)

This is an automated email from the ASF dual-hosted git repository.

gabor pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-mr.git


The following commit(s) were added to refs/heads/master by this push:
     new 5608695  PARQUET-1978: Provide a tool to show the complete footer (#867)
5608695 is described below

commit 5608695f5777de1eb0899d9075ec9411cfdf31d3
Author: Gabor Szadovszky <ga...@apache.org>
AuthorDate: Fri Mar 19 11:21:36 2021 +0100

    PARQUET-1978: Provide a tool to show the complete footer (#867)
---
 .../src/main/java/org/apache/parquet/cli/Main.java |   2 +
 .../parquet/cli/commands/ShowFooterCommand.java    | 144 +++++++++++++++++++++
 .../cli/commands/ShowFooterCommandTest.java        |  43 ++++++
 .../java/org/apache/parquet/format/CliUtils.java   |  55 ++++++++
 4 files changed, 244 insertions(+)

diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java
index 1b52a1c..d656476 100644
--- a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java
+++ b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java
@@ -37,6 +37,7 @@ import org.apache.parquet.cli.commands.PruneColumnsCommand;
 import org.apache.parquet.cli.commands.SchemaCommand;
 import org.apache.parquet.cli.commands.ShowColumnIndexCommand;
 import org.apache.parquet.cli.commands.ShowDictionaryCommand;
+import org.apache.parquet.cli.commands.ShowFooterCommand;
 import org.apache.parquet.cli.commands.ShowPagesCommand;
 import org.apache.parquet.cli.commands.ToAvroCommand;
 import org.apache.commons.logging.LogFactory;
@@ -97,6 +98,7 @@ public class Main extends Configured implements Tool {
     jc.addCommand("prune", new PruneColumnsCommand(console));
     jc.addCommand("trans-compression", new TransCompressionCommand(console));
     jc.addCommand("masking", new ColumnMaskingCommand(console));
+    jc.addCommand("footer", new ShowFooterCommand(console));
   }
 
   @Override
diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowFooterCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowFooterCommand.java
new file mode 100644
index 0000000..a5a5c1f
--- /dev/null
+++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowFooterCommand.java
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.parquet.cli.commands;
+
+import static org.apache.parquet.bytes.BytesUtils.readIntLittleEndian;
+import static org.apache.parquet.hadoop.ParquetFileWriter.EFMAGIC;
+import static org.apache.parquet.hadoop.ParquetFileWriter.MAGIC;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.parquet.bytes.ByteBufferInputStream;
+import org.apache.parquet.cli.BaseCommand;
+import org.apache.parquet.format.CliUtils;
+import org.apache.parquet.format.Util;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+import org.apache.parquet.hadoop.util.HadoopInputFile;
+import org.apache.parquet.io.InputFile;
+import org.apache.parquet.io.SeekableInputStream;
+import org.slf4j.Logger;
+
+import com.beust.jcommander.Parameter;
+import com.beust.jcommander.Parameters;
+import com.fasterxml.jackson.annotation.JsonAutoDetect.Visibility;
+import com.fasterxml.jackson.annotation.PropertyAccessor;
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.SerializationFeature;
+
+@Parameters(commandDescription = "Print the Parquet file footer in json format")
+public class ShowFooterCommand extends BaseCommand {
+
+  public ShowFooterCommand(Logger console) {
+    super(console);
+  }
+  // Positional argument: path of the Parquet file whose footer is printed.
+  @Parameter(description = "<parquet path>", required = true)
+  String target;
+  // When set, run() prints the footer via readRawFooter() instead of readFooter().
+  @Parameter(names = { "-r", "--raw" }, description = "Print the raw thrift object of the footer")
+  boolean raw = false;
+  /** Logs the footer of {@code target} as pretty-printed JSON through {@code console}; returns 0 once done. */
+  @Override
+  public int run() throws IOException {
+    InputFile inputFile = HadoopInputFile.fromPath(qualifiedPath(target), getConf());
+
+    console.info(raw ? readRawFooter(inputFile) : readFooter(inputFile));
+
+    return 0;
+  }
+  /** Serializes the {@link ParquetMetadata} returned by {@link ParquetFileReader#getFooter()} to pretty JSON. */
+  private String readFooter(InputFile inputFile) throws JsonProcessingException, IOException {
+    String json;
+    try (ParquetFileReader reader = ParquetFileReader.open(inputFile)) {
+      ParquetMetadata footer = reader.getFooter();
+      ObjectMapper mapper = createObjectMapper();
+      mapper.setVisibility(PropertyAccessor.ALL, Visibility.NONE); // switch off every auto-detected accessor...
+      mapper.setVisibility(PropertyAccessor.FIELD, Visibility.ANY); // ...then serialize fields of any visibility
+      json = mapper.writerWithDefaultPrettyPrinter().writeValueAsString(footer);
+    }
+    return json;
+  }
+  /** Creates a mapper that does not fail on empty beans and orders map entries by key. */
+  private ObjectMapper createObjectMapper() {
+    ObjectMapper mapper = new ObjectMapper();
+    mapper.configure(SerializationFeature.FAIL_ON_EMPTY_BEANS, false);
+    mapper.configure(SerializationFeature.ORDER_MAP_ENTRIES_BY_KEYS, true);
+    return mapper;
+  }
+  /** Locates the footer bytes at the tail of the file, parses them with Util.readFileMetaData, returns pretty JSON. */
+  private String readRawFooter(InputFile file) throws IOException {
+    long fileLen = file.getLength();
+
+    int FOOTER_LENGTH_SIZE = 4; // size in bytes of the little-endian footer length field
+    if (fileLen < MAGIC.length + FOOTER_LENGTH_SIZE + MAGIC.length) { // leading MAGIC + footer length field + trailing MAGIC
+      throw new RuntimeException("Not a Parquet file (length is too low: " + fileLen + ")");
+    }
+
+    try (SeekableInputStream f = file.newStream()) {
+      // Read footer length and magic string - with a single seek
+      byte[] magic = new byte[MAGIC.length];
+      long fileMetadataLengthIndex = fileLen - magic.length - FOOTER_LENGTH_SIZE;
+      f.seek(fileMetadataLengthIndex);
+      int fileMetadataLength = readIntLittleEndian(f);
+      f.readFully(magic);
+
+      if (Arrays.equals(EFMAGIC, magic)) {
+        throw new RuntimeException("Parquet files with encrypted footers are not supported.");
+      } else if (!Arrays.equals(MAGIC, magic)) {
+        throw new RuntimeException(
+            "Not a Parquet file (expected magic number at tail, but found " + Arrays.toString(magic) + ')');
+      }
+      // The footer ends where its length field starts; it must begin after the leading magic and be non-empty.
+      long fileMetadataIndex = fileMetadataLengthIndex - fileMetadataLength;
+      if (fileMetadataIndex < magic.length || fileMetadataIndex >= fileMetadataLengthIndex) {
+        throw new RuntimeException("Corrupted file: the footer index is not within the file: " + fileMetadataIndex);
+      }
+      f.seek(fileMetadataIndex);
+
+      ByteBuffer footerBytesBuffer = ByteBuffer.allocate(fileMetadataLength);
+      f.readFully(footerBytesBuffer);
+      footerBytesBuffer.flip(); // switch the buffer from being filled to being read
+      InputStream footerBytesStream = ByteBufferInputStream.wrap(footerBytesBuffer);
+      return prettify(CliUtils.toJson(Util.readFileMetaData(footerBytesStream)));
+    }
+  }
+  /** Re-parses the given JSON text and writes it back with Jackson's default pretty printer. */
+  private String prettify(String json) throws JsonProcessingException {
+    ObjectMapper mapper = createObjectMapper();
+    Object obj = mapper.readValue(json, Object.class);
+    return mapper.writerWithDefaultPrettyPrinter().writeValueAsString(obj);
+  }
+  /** Returns usage examples: each entry starting with '#' describes the argument line that follows it. */
+  @Override
+  public List<String> getExamples() {
+    return Arrays.asList(
+        "# Print the parquet-mr interpreted footer of the specified Parquet file in json format",
+        "sample.parquet",
+        "# Print the raw thrift footer object of the specified Parquet file in json format",
+        "sample.parquet --raw");
+  }
+
+}
diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ShowFooterCommandTest.java b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ShowFooterCommandTest.java
new file mode 100644
index 0000000..61a598a
--- /dev/null
+++ b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ShowFooterCommandTest.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.parquet.cli.commands;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.junit.Test;
+
+public class ShowFooterCommandTest extends ParquetFileTest {
+  @Test // smoke test: both output modes must complete with return code 0
+  public void testShowFooterCommand() throws IOException {
+    File file = parquetFile();
+    ShowFooterCommand command = new ShowFooterCommand(createLogger());
+    command.target = file.getAbsolutePath();
+    command.raw = false; // first pass: parquet-mr interpreted footer
+    command.setConf(new Configuration());
+    assertEquals(0, command.run());
+    // Same command instance and file again, this time through the --raw path.
+    command.raw = true;
+    assertEquals(0, command.run());
+  }
+}
diff --git a/parquet-format-structures/src/main/java/org/apache/parquet/format/CliUtils.java b/parquet-format-structures/src/main/java/org/apache/parquet/format/CliUtils.java
new file mode 100644
index 0000000..3739b64
--- /dev/null
+++ b/parquet-format-structures/src/main/java/org/apache/parquet/format/CliUtils.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.parquet.format;
+
+import java.io.IOException;
+
+import org.apache.thrift.TBase;
+import org.apache.thrift.TException;
+import org.apache.thrift.TSerializer;
+import org.apache.thrift.protocol.TSimpleJSONProtocol;
+
+/**
+ * Utility class for parquet-cli. This is required because this module shades thriftlib which means we cannot use Thrift
+ * classes outside of this module without adding thriftlib as a separate dependency.
+ */
+public class CliUtils {
+
+  /**
+   * Returns the JSON representation of the specified Thrift object.
+   *
+   * @param tbase the Thrift object to be serialized as JSON
+   * @return the JSON representation of the Thrift object as a String
+   * @throws IOException if any Thrift error occurs during the serialization
+   */
+  public static String toJson(TBase<?, ?> tbase) throws IOException {
+    TSerializer serializer = new TSerializer(new TSimpleJSONProtocol.Factory());
+    try {
+      return serializer.toString(tbase);
+    } catch (TException e) {
+      // Wrap the exception so that the shaded Thrift class TException is not exposed to callers
+      throw new IOException(e);
+    }
+  }
+
+  private CliUtils() {
+    // private constructor to avoid instantiation
+  }
+}