You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by ga...@apache.org on 2021/03/19 10:21:44 UTC
[parquet-mr] branch master updated: PARQUET-1978: Provide a tool to
show the complete footer (#867)
This is an automated email from the ASF dual-hosted git repository.
gabor pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-mr.git
The following commit(s) were added to refs/heads/master by this push:
new 5608695 PARQUET-1978: Provide a tool to show the complete footer (#867)
5608695 is described below
commit 5608695f5777de1eb0899d9075ec9411cfdf31d3
Author: Gabor Szadovszky <ga...@apache.org>
AuthorDate: Fri Mar 19 11:21:36 2021 +0100
PARQUET-1978: Provide a tool to show the complete footer (#867)
---
.../src/main/java/org/apache/parquet/cli/Main.java | 2 +
.../parquet/cli/commands/ShowFooterCommand.java | 144 +++++++++++++++++++++
.../cli/commands/ShowFooterCommandTest.java | 43 ++++++
.../java/org/apache/parquet/format/CliUtils.java | 55 ++++++++
4 files changed, 244 insertions(+)
diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java
index 1b52a1c..d656476 100644
--- a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java
+++ b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java
@@ -37,6 +37,7 @@ import org.apache.parquet.cli.commands.PruneColumnsCommand;
import org.apache.parquet.cli.commands.SchemaCommand;
import org.apache.parquet.cli.commands.ShowColumnIndexCommand;
import org.apache.parquet.cli.commands.ShowDictionaryCommand;
+import org.apache.parquet.cli.commands.ShowFooterCommand;
import org.apache.parquet.cli.commands.ShowPagesCommand;
import org.apache.parquet.cli.commands.ToAvroCommand;
import org.apache.commons.logging.LogFactory;
@@ -97,6 +98,7 @@ public class Main extends Configured implements Tool {
jc.addCommand("prune", new PruneColumnsCommand(console));
jc.addCommand("trans-compression", new TransCompressionCommand(console));
jc.addCommand("masking", new ColumnMaskingCommand(console));
+ jc.addCommand("footer", new ShowFooterCommand(console));
}
@Override
diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowFooterCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowFooterCommand.java
new file mode 100644
index 0000000..a5a5c1f
--- /dev/null
+++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowFooterCommand.java
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.parquet.cli.commands;
+
+import static org.apache.parquet.bytes.BytesUtils.readIntLittleEndian;
+import static org.apache.parquet.hadoop.ParquetFileWriter.EFMAGIC;
+import static org.apache.parquet.hadoop.ParquetFileWriter.MAGIC;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.parquet.bytes.ByteBufferInputStream;
+import org.apache.parquet.cli.BaseCommand;
+import org.apache.parquet.format.CliUtils;
+import org.apache.parquet.format.Util;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+import org.apache.parquet.hadoop.util.HadoopInputFile;
+import org.apache.parquet.io.InputFile;
+import org.apache.parquet.io.SeekableInputStream;
+import org.slf4j.Logger;
+
+import com.beust.jcommander.Parameter;
+import com.beust.jcommander.Parameters;
+import com.fasterxml.jackson.annotation.JsonAutoDetect.Visibility;
+import com.fasterxml.jackson.annotation.PropertyAccessor;
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.SerializationFeature;
+
+@Parameters(commandDescription = "Print the Parquet file footer in json format")
+public class ShowFooterCommand extends BaseCommand { // CLI command that logs a Parquet file's footer as pretty-printed JSON
+
+ public ShowFooterCommand(Logger console) {
+ super(console);
+ }
+
+ @Parameter(description = "<parquet path>", required = true)
+ String target; // path of the Parquet file to inspect; resolved through qualifiedPath() in run()
+
+ @Parameter(names = { "-r", "--raw" }, description = "Print the raw thrift object of the footer")
+ boolean raw = false; // true: print the raw Thrift footer; false: print the ParquetMetadata read by ParquetFileReader
+
+ @Override
+ public int run() throws IOException { // logs the footer JSON through console.info and returns 0; failures propagate as exceptions
+ InputFile inputFile = HadoopInputFile.fromPath(qualifiedPath(target), getConf());
+
+ console.info(raw ? readRawFooter(inputFile) : readFooter(inputFile));
+
+ return 0;
+ }
+
+ private String readFooter(InputFile inputFile) throws JsonProcessingException, IOException { // footer as interpreted by parquet-mr, rendered as pretty JSON
+ String json;
+ try (ParquetFileReader reader = ParquetFileReader.open(inputFile)) {
+ ParquetMetadata footer = reader.getFooter();
+ ObjectMapper mapper = createObjectMapper();
+ mapper.setVisibility(PropertyAccessor.ALL, Visibility.NONE); // turn off auto-detection for every accessor kind first ...
+ mapper.setVisibility(PropertyAccessor.FIELD, Visibility.ANY); // ... then serialize fields only, whatever their access modifier
+ json = mapper.writerWithDefaultPrettyPrinter().writeValueAsString(footer);
+ }
+ return json;
+ }
+
+ private ObjectMapper createObjectMapper() { // mapper configuration shared by readFooter() and prettify()
+ ObjectMapper mapper = new ObjectMapper();
+ mapper.configure(SerializationFeature.FAIL_ON_EMPTY_BEANS, false); // do not throw for objects that expose no serializable properties
+ mapper.configure(SerializationFeature.ORDER_MAP_ENTRIES_BY_KEYS, true); // write java.util.Map entries sorted by key
+ return mapper;
+ }
+
+ private String readRawFooter(InputFile file) throws IOException { // reads the Thrift footer bytes from the file tail and returns them as pretty JSON
+ long fileLen = file.getLength();
+
+ int FOOTER_LENGTH_SIZE = 4; // byte size of the little-endian footer-length field read just before the trailing magic
+ if (fileLen < MAGIC.length + FOOTER_LENGTH_SIZE + MAGIC.length) { // smallest accepted size: leading MAGIC + footer-length field + trailing MAGIC
+ throw new RuntimeException("Not a Parquet file (length is too low: " + fileLen + ")");
+ }
+
+ try (SeekableInputStream f = file.newStream()) {
+ // Read footer length and magic string - with a single seek
+ byte[] magic = new byte[MAGIC.length];
+ long fileMetadataLengthIndex = fileLen - magic.length - FOOTER_LENGTH_SIZE;
+ f.seek(fileMetadataLengthIndex);
+ int fileMetadataLength = readIntLittleEndian(f);
+ f.readFully(magic);
+
+ if (Arrays.equals(EFMAGIC, magic)) { // EFMAGIC at the tail marks an encrypted footer, which this command rejects
+ throw new RuntimeException("Parquet files with encrypted footers are not supported.");
+ } else if (!Arrays.equals(MAGIC, magic)) {
+ throw new RuntimeException(
+ "Not a Parquet file (expected magic number at tail, but found " + Arrays.toString(magic) + ')');
+ }
+
+ long fileMetadataIndex = fileMetadataLengthIndex - fileMetadataLength; // the footer bytes end where the length field begins
+ if (fileMetadataIndex < magic.length || fileMetadataIndex >= fileMetadataLengthIndex) { // second test also rejects zero or negative lengths
+ throw new RuntimeException("Corrupted file: the footer index is not within the file: " + fileMetadataIndex);
+ }
+ f.seek(fileMetadataIndex);
+
+ ByteBuffer footerBytesBuffer = ByteBuffer.allocate(fileMetadataLength);
+ f.readFully(footerBytesBuffer);
+ footerBytesBuffer.flip(); // switch the filled buffer from writing to reading
+ InputStream footerBytesStream = ByteBufferInputStream.wrap(footerBytesBuffer);
+ return prettify(CliUtils.toJson(Util.readFileMetaData(footerBytesStream)));
+ }
+ }
+
+ private String prettify(String json) throws JsonProcessingException { // re-parses the JSON text and writes it back with the default pretty printer
+ ObjectMapper mapper = createObjectMapper();
+ Object obj = mapper.readValue(json, Object.class);
+ return mapper.writerWithDefaultPrettyPrinter().writeValueAsString(obj);
+ }
+
+ @Override
+ public List<String> getExamples() { // example argument lines, each preceded by a '#' line describing it
+ return Arrays.asList(
+ "# Print the parquet-mr interpreted footer of the specified Parquet file in json format",
+ "sample.parquet",
+ "# Print the raw thrift footer object of the specified Parquet file in json format",
+ "sample.parquet --raw");
+ }
+
+}
diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ShowFooterCommandTest.java b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ShowFooterCommandTest.java
new file mode 100644
index 0000000..61a598a
--- /dev/null
+++ b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ShowFooterCommandTest.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.parquet.cli.commands;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.junit.Test;
+
+public class ShowFooterCommandTest extends ParquetFileTest { // smoke test: runs ShowFooterCommand in both output modes
+ @Test
+ public void testShowDirectoryCommand() throws IOException { // NOTE(review): name says "Directory" but the test drives ShowFooterCommand - consider renaming
+ File file = parquetFile(); // presumably the sample file provided by ParquetFileTest - confirm
+ ShowFooterCommand command = new ShowFooterCommand(createLogger());
+ command.target = file.getAbsolutePath();
+ command.raw = false; // first pass: parquet-mr interpreted footer
+ command.setConf(new Configuration());
+ assertEquals(0, command.run()); // only the return code is checked, not the logged JSON
+
+ command.raw = true; // second pass: raw Thrift footer, same command instance
+ assertEquals(0, command.run());
+ }
+}
diff --git a/parquet-format-structures/src/main/java/org/apache/parquet/format/CliUtils.java b/parquet-format-structures/src/main/java/org/apache/parquet/format/CliUtils.java
new file mode 100644
index 0000000..3739b64
--- /dev/null
+++ b/parquet-format-structures/src/main/java/org/apache/parquet/format/CliUtils.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.parquet.format;
+
+import java.io.IOException;
+
+import org.apache.thrift.TBase;
+import org.apache.thrift.TException;
+import org.apache.thrift.TSerializer;
+import org.apache.thrift.protocol.TSimpleJSONProtocol;
+
+/**
+ * Utility class for parquet-cli. It is required because this module shades the Thrift library, which means Thrift
+ * classes cannot be used outside of this module without adding the Thrift library as a separate dependency.
+ */
+public class CliUtils {
+
+ /**
+ * Serializes the specified Thrift object to JSON text using {@link TSimpleJSONProtocol}.
+ *
+ * @param tbase the Thrift object to serialize
+ * @return the JSON representation of the Thrift object as a String
+ * @throws IOException wrapping the {@link TException} raised if Thrift fails during the serialization
+ */
+ public static String toJson(TBase<?, ?> tbase) throws IOException {
+ TSerializer serializer = new TSerializer(new TSimpleJSONProtocol.Factory());
+ try {
+ return serializer.toString(tbase);
+ } catch (TException e) {
+ // Wrap the exception so that the shaded Thrift class TException is not exposed to callers
+ throw new IOException(e);
+ }
+ }
+
+ private CliUtils() {
+ // private constructor to avoid instantiation
+ }
+}