You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by ga...@apache.org on 2019/09/24 10:22:46 UTC
[parquet-mr] branch master updated: PARQUET-1644: Clean up some
benchmark code and docs. (#672)
This is an automated email from the ASF dual-hosted git repository.
gabor pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-mr.git
The following commit(s) were added to refs/heads/master by this push:
new 7c4d1ec PARQUET-1644: Clean up some benchmark code and docs. (#672)
7c4d1ec is described below
commit 7c4d1ec859d46bead9f8ed9446d2d5875082211d
Author: RyanSkraba <ry...@skraba.com>
AuthorDate: Tue Sep 24 12:22:40 2019 +0200
PARQUET-1644: Clean up some benchmark code and docs. (#672)
---
parquet-benchmarks/README.md | 36 ++++++--
parquet-benchmarks/run.sh | 96 ++++++++++++++++++++--
.../apache/parquet/benchmarks/BenchmarkFiles.java | 2 +
.../apache/parquet/benchmarks/DataGenerator.java | 9 +-
.../benchmarks/PageChecksumDataGenerator.java | 23 +-----
.../benchmarks/PageChecksumReadBenchmarks.java | 63 ++++++++------
.../benchmarks/PageChecksumWriteBenchmarks.java | 56 ++++++++-----
.../apache/parquet/benchmarks/ReadBenchmarks.java | 25 ++++++
.../apache/parquet/benchmarks/WriteBenchmarks.java | 11 ++-
.../main/resources/log4j.properties} | 14 ++--
10 files changed, 237 insertions(+), 98 deletions(-)
diff --git a/parquet-benchmarks/README.md b/parquet-benchmarks/README.md
index 8da067b..63101bd 100644
--- a/parquet-benchmarks/README.md
+++ b/parquet-benchmarks/README.md
@@ -17,22 +17,42 @@
~ under the License.
-->
-##Running Parquet Benchmarks
+# Running Parquet Benchmarks
-First, build the ``parquet-benchmarks`` module
+The Parquet benchmarks in this module are run using the
+[OpenJDK Java Microbenchmarking Harness](http://openjdk.java.net/projects/code-tools/jmh/).
+
+First, building the `parquet-benchmarks` module creates an uber-jar including the Parquet
+classes and all dependencies, and a main class to launch the JMH tool.
```
mvn --projects parquet-benchmarks -amd -DskipTests -Denforcer.skip=true clean package
```
-Then, you can run all the benchmarks with the following command
+JMH doesn't have the notion of "benchmark suites", but there are certain benchmarks that
+make sense to group together or to run in isolation during development. The
+`./parquet-benchmarks/run.sh` script can be used to launch all or some benchmarks:
```
-./parquet-benchmarks/run.sh -wi 5 -i 5 -f 3 -bm all
-```
+# More information about the run script and the available arguments.
+./parquet-benchmarks/run.sh
+
+# More information on the JMH options available.
+./parquet-benchmarks/run.sh all -help
+
+# Run every benchmark once (~20 minutes).
+./parquet-benchmarks/run.sh all -wi 0 -i 1 -f 1
-To understand what each command line argument means and for more arguments please see
+# A more rigorous run of all benchmarks, saving a report for comparison.
+./parquet-benchmarks/run.sh all -wi 5 -i 5 -f 3 -rff /tmp/benchmark1.json
+# Run a benchmark "suite" built into the script, with JMH defaults (about 30 minutes)
+./parquet-benchmarks/run.sh checksum
+
+# Running one specific benchmark using a regex.
+./parquet-benchmarks/run.sh all org.apache.parquet.benchmarks.NestedNullWritingBenchmarks
+
+# Manually clean up any state left behind from a previous run.
+./parquet-benchmarks/run.sh clean
```
-java -jar parquet-benchmarks/target/parquet-benchmarks.jar -help
-```
\ No newline at end of file
+
diff --git a/parquet-benchmarks/run.sh b/parquet-benchmarks/run.sh
index 8aa1e69..ba40766 100755
--- a/parquet-benchmarks/run.sh
+++ b/parquet-benchmarks/run.sh
@@ -20,11 +20,91 @@
SCRIPT_PATH=$( cd "$(dirname "$0")" ; pwd -P )
-echo "Starting WRITE benchmarks"
-java -jar ${SCRIPT_PATH}/target/parquet-benchmarks.jar p*Write* "$@"
-echo "Generating test data"
-java -cp ${SCRIPT_PATH}/target/parquet-benchmarks.jar org.apache.parquet.benchmarks.DataGenerator generate
-echo "Data generated, starting READ benchmarks"
-java -jar ${SCRIPT_PATH}/target/parquet-benchmarks.jar p*Read* "$@"
-echo "Cleaning up generated data"
-java -cp ${SCRIPT_PATH}/target/parquet-benchmarks.jar org.apache.parquet.benchmarks.DataGenerator cleanup
+BENCHMARK=$1; shift
+JMH_OPTIONS="$@"
+
+if [ -z "$BENCHMARK" ]; then
+
+ # Print usage if run without arguments.
+ cat << EOF
+Runs Parquet JMH-based benchmarks.
+
+Usage:
+ run.sh <BENCHMARK> [JMH_OPTIONS]
+
+Information on the JMH_OPTIONS can be found by running: run.sh all -help
+
+<BENCHMARK> | Description
+----------- | ----------
+all | Runs all benchmarks in the module (listed here and others).
+build | (No benchmark run, shortcut to rebuild the JMH uber jar).
+clean | (No benchmark run, shortcut to clean up any temporary files).
+read | Reading files with different compression, page and block sizes.
+write | Writing files.
+checksum | Reading and writing with and without CRC checksums.
+filter | Filtering column indexes
+
+Examples:
+
+# More information about the run script and the available arguments.
+./parquet-benchmarks/run.sh
+
+# More information on the JMH options available.
+./parquet-benchmarks/run.sh all -help
+
+# Run every benchmark once (~20 minutes).
+./parquet-benchmarks/run.sh all -wi 0 -i 1 -f 1
+
+# A more rigorous run of all benchmarks, saving a report for comparison.
+./parquet-benchmarks/run.sh all -wi 5 -i 5 -f 3 -rff /tmp/benchmark1.json
+
+# Run a benchmark "suite" built into the script, with JMH defaults (about 30 minutes)
+./parquet-benchmarks/run.sh checksum
+
+# Running one specific benchmark using a regex.
+./parquet-benchmarks/run.sh all org.apache.parquet.benchmarks.NestedNullWritingBenchmarks
+
+EOF
+
+elif [ "$BENCHMARK" == "build" ]; then
+
+ # Shortcut utility to rebuild the benchmark module only.
+ ( cd $SCRIPT_PATH && mvn -amd -DskipTests -Denforcer.skip=true clean package )
+
+elif [ "$BENCHMARK" == "clean" ]; then
+
+ # Shortcut utility to clean any state left behind from any previous run.
+ java -cp ${SCRIPT_PATH}/target/parquet-benchmarks.jar org.apache.parquet.benchmarks.DataGenerator cleanup
+
+else
+
+ # Actually run a benchmark in the JMH harness.
+
+ # Build the benchmark uberjar if it doesn't already exist.
+ if [ ! -f ${SCRIPT_PATH}/target/parquet-benchmarks.jar ]; then
+ ${SCRIPT_PATH}/run.sh build
+ fi
+
+ # Pick a regex if specified.
+ BENCHMARK_REGEX=""
+ case "$BENCHMARK" in
+ "read")
+ BENCHMARK_REGEX="org.apache.parquet.benchmarks.ReadBenchmarks"
+ ;;
+ "write")
+ BENCHMARK_REGEX="org.apache.parquet.benchmarks.WriteBenchmarks"
+ ;;
+ "checksum")
+ BENCHMARK_REGEX="org.apache.parquet.benchmarks.PageChecksum.*"
+ ;;
+ "filter")
+ BENCHMARK_REGEX="org.apache.parquet.benchmarks.FilteringBenchmarks"
+ ;;
+ esac
+
+ echo JMH command: java -jar ${SCRIPT_PATH}/target/parquet-benchmarks.jar $BENCHMARK_REGEX $JMH_OPTIONS
+ java -jar ${SCRIPT_PATH}/target/parquet-benchmarks.jar $BENCHMARK_REGEX $JMH_OPTIONS
+
+ # Clean any data files generated by the benchmarks.
+ ${SCRIPT_PATH}/run.sh clean
+fi
diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BenchmarkFiles.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BenchmarkFiles.java
index f039403..24da822 100644
--- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BenchmarkFiles.java
+++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BenchmarkFiles.java
@@ -25,6 +25,8 @@ public class BenchmarkFiles {
public static final Configuration configuration = new Configuration();
public static final String TARGET_DIR = "target/tests/ParquetBenchmarks";
+ public static final Path targetDir = new Path(TARGET_DIR );
+
public static final Path file_1M = new Path(TARGET_DIR + "/PARQUET-1M");
//different block and page sizes
diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/DataGenerator.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/DataGenerator.java
index 42d9953..3b5db68 100644
--- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/DataGenerator.java
+++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/DataGenerator.java
@@ -115,14 +115,7 @@ public class DataGenerator {
public void cleanup()
{
- deleteIfExists(configuration, file_1M);
- deleteIfExists(configuration, file_1M_BS256M_PS4M);
- deleteIfExists(configuration, file_1M_BS256M_PS8M);
- deleteIfExists(configuration, file_1M_BS512M_PS4M);
- deleteIfExists(configuration, file_1M_BS512M_PS8M);
-// deleteIfExists(configuration, parquetFile_1M_LZO);
- deleteIfExists(configuration, file_1M_SNAPPY);
- deleteIfExists(configuration, file_1M_GZIP);
+ deleteIfExists(configuration, targetDir);
}
public static void main(String[] args) {
diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PageChecksumDataGenerator.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PageChecksumDataGenerator.java
index 6c62cc6..49ebdce 100644
--- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PageChecksumDataGenerator.java
+++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PageChecksumDataGenerator.java
@@ -40,7 +40,7 @@ import static org.apache.parquet.benchmarks.BenchmarkUtils.deleteIfExists;
import static org.apache.parquet.benchmarks.BenchmarkUtils.exists;
import static org.apache.parquet.hadoop.metadata.CompressionCodecName.*;
-public class PageChecksumDataGenerator {
+public class PageChecksumDataGenerator extends DataGenerator {
private final MessageType SCHEMA = MessageTypeParser.parseMessageType(
"message m {" +
@@ -103,25 +103,4 @@ public class PageChecksumDataGenerator {
throw new RuntimeException(e);
}
}
-
- public void cleanup() {
- deleteIfExists(configuration, file_100K_NOCHECKSUMS_UNCOMPRESSED);
- deleteIfExists(configuration, file_100K_CHECKSUMS_UNCOMPRESSED);
- deleteIfExists(configuration, file_100K_NOCHECKSUMS_GZIP);
- deleteIfExists(configuration, file_100K_CHECKSUMS_GZIP);
- deleteIfExists(configuration, file_100K_NOCHECKSUMS_SNAPPY);
- deleteIfExists(configuration, file_100K_CHECKSUMS_SNAPPY);
- deleteIfExists(configuration, file_1M_NOCHECKSUMS_UNCOMPRESSED);
- deleteIfExists(configuration, file_1M_CHECKSUMS_UNCOMPRESSED);
- deleteIfExists(configuration, file_1M_NOCHECKSUMS_GZIP);
- deleteIfExists(configuration, file_1M_CHECKSUMS_GZIP);
- deleteIfExists(configuration, file_1M_NOCHECKSUMS_SNAPPY);
- deleteIfExists(configuration, file_1M_CHECKSUMS_SNAPPY);
- deleteIfExists(configuration, file_10M_NOCHECKSUMS_UNCOMPRESSED);
- deleteIfExists(configuration, file_10M_CHECKSUMS_UNCOMPRESSED);
- deleteIfExists(configuration, file_10M_NOCHECKSUMS_GZIP);
- deleteIfExists(configuration, file_10M_CHECKSUMS_GZIP);
- deleteIfExists(configuration, file_10M_NOCHECKSUMS_SNAPPY);
- deleteIfExists(configuration, file_10M_CHECKSUMS_SNAPPY);
- }
}
diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PageChecksumReadBenchmarks.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PageChecksumReadBenchmarks.java
index db23eeb..be2ebe4 100644
--- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PageChecksumReadBenchmarks.java
+++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PageChecksumReadBenchmarks.java
@@ -51,16 +51,15 @@ public class PageChecksumReadBenchmarks {
private PageChecksumDataGenerator pageChecksumDataGenerator = new PageChecksumDataGenerator();
+ /**
+ * This needs to be done exactly once. To avoid needlessly regenerating the files for reading, they aren't cleaned
+ * as part of the benchmark. If the files exist, a message will be printed and they will not be regenerated.
+ */
@Setup(Level.Trial)
public void setup() {
pageChecksumDataGenerator.generateAll();
}
- @Setup(Level.Trial)
- public void cleanup() {
- pageChecksumDataGenerator.cleanup();
- }
-
private void readFile(Path file, int nRows, boolean verifyChecksums, Blackhole blackhole)
throws IOException {
try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file)
@@ -82,96 +81,114 @@ public class PageChecksumReadBenchmarks {
// 100k rows, uncompressed, GZIP, Snappy
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void read100KRowsUncompressedWithoutVerification(Blackhole blackhole) throws IOException {
readFile(file_100K_CHECKSUMS_UNCOMPRESSED, 100 * ONE_K, false, blackhole);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void read100KRowsUncompressedWithVerification(Blackhole blackhole) throws IOException {
readFile(file_100K_CHECKSUMS_UNCOMPRESSED, 100 * ONE_K, true, blackhole);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void read100KRowsGzipWithoutVerification(Blackhole blackhole) throws IOException {
readFile(file_100K_CHECKSUMS_GZIP, 100 * ONE_K, false, blackhole);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void read100KRowsGzipWithVerification(Blackhole blackhole) throws IOException {
readFile(file_100K_CHECKSUMS_GZIP, 100 * ONE_K, true, blackhole);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void read100KRowsSnappyWithoutVerification(Blackhole blackhole) throws IOException {
readFile(file_100K_CHECKSUMS_SNAPPY, 100 * ONE_K, false, blackhole);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void read100KRowsSnappyWithVerification(Blackhole blackhole) throws IOException {
readFile(file_100K_CHECKSUMS_SNAPPY, 100 * ONE_K, true, blackhole);
}
// 1M rows, uncompressed, GZIP, Snappy
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void read1MRowsUncompressedWithoutVerification(Blackhole blackhole) throws IOException {
readFile(file_1M_CHECKSUMS_UNCOMPRESSED, ONE_MILLION, false, blackhole);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void read1MRowsUncompressedWithVerification(Blackhole blackhole) throws IOException {
readFile(file_1M_CHECKSUMS_UNCOMPRESSED, ONE_MILLION, true, blackhole);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void read1MRowsGzipWithoutVerification(Blackhole blackhole) throws IOException {
readFile(file_1M_CHECKSUMS_GZIP, ONE_MILLION, false, blackhole);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void read1MRowsGzipWithVerification(Blackhole blackhole) throws IOException {
readFile(file_1M_CHECKSUMS_GZIP, ONE_MILLION, true, blackhole);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void read1MRowsSnappyWithoutVerification(Blackhole blackhole) throws IOException {
readFile(file_1M_CHECKSUMS_SNAPPY, ONE_MILLION, false, blackhole);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void read1MRowsSnappyWithVerification(Blackhole blackhole) throws IOException {
readFile(file_1M_CHECKSUMS_SNAPPY, ONE_MILLION, true, blackhole);
}
// 10M rows, uncompressed, GZIP, Snappy
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void read10MRowsUncompressedWithoutVerification(Blackhole blackhole) throws IOException {
readFile(file_10M_CHECKSUMS_UNCOMPRESSED, 10 * ONE_MILLION, false, blackhole);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void read10MRowsUncompressedWithVerification(Blackhole blackhole) throws IOException {
readFile(file_10M_CHECKSUMS_UNCOMPRESSED, 10 * ONE_MILLION, true, blackhole);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void read10MRowsGzipWithoutVerification(Blackhole blackhole) throws IOException {
readFile(file_10M_CHECKSUMS_GZIP, 10 * ONE_MILLION, false, blackhole);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void read10MRowsGzipWithVerification(Blackhole blackhole) throws IOException {
readFile(file_10M_CHECKSUMS_GZIP, 10 * ONE_MILLION, true, blackhole);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void read10MRowsSnappyWithoutVerification(Blackhole blackhole) throws IOException {
readFile(file_10M_CHECKSUMS_SNAPPY, 10 * ONE_MILLION, false, blackhole);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void read10MRowsSnappyWithVerification(Blackhole blackhole) throws IOException {
readFile(file_10M_CHECKSUMS_SNAPPY, 10 * ONE_MILLION, true, blackhole);
}
diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PageChecksumWriteBenchmarks.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PageChecksumWriteBenchmarks.java
index c743dde..e892d53 100644
--- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PageChecksumWriteBenchmarks.java
+++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PageChecksumWriteBenchmarks.java
@@ -57,102 +57,120 @@ public class PageChecksumWriteBenchmarks {
private PageChecksumDataGenerator pageChecksumDataGenerator = new PageChecksumDataGenerator();
@Setup(Level.Iteration)
- public void cleanup() {
+ public void setup() {
pageChecksumDataGenerator.cleanup();
}
// 100k rows, uncompressed, GZIP, Snappy
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void write100KRowsUncompressedWithoutChecksums() throws IOException {
pageChecksumDataGenerator.generateData(file_100K_NOCHECKSUMS_UNCOMPRESSED, 100 * ONE_K, false, UNCOMPRESSED);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void write100KRowsUncompressedWithChecksums() throws IOException {
pageChecksumDataGenerator.generateData(file_100K_CHECKSUMS_UNCOMPRESSED, 100 * ONE_K, true, UNCOMPRESSED);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void write100KRowsGzipWithoutChecksums() throws IOException {
pageChecksumDataGenerator.generateData(file_100K_NOCHECKSUMS_GZIP, 100 * ONE_K, false, GZIP);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void write100KRowsGzipWithChecksums() throws IOException {
pageChecksumDataGenerator.generateData(file_100K_CHECKSUMS_GZIP, 100 * ONE_K, true, GZIP);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void write100KRowsSnappyWithoutChecksums() throws IOException {
pageChecksumDataGenerator.generateData(file_100K_NOCHECKSUMS_SNAPPY, 100 * ONE_K, false, SNAPPY);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void write100KRowsSnappyWithChecksums() throws IOException {
pageChecksumDataGenerator.generateData(file_100K_CHECKSUMS_SNAPPY, 100 * ONE_K, true, SNAPPY);
}
// 1M rows, uncompressed, GZIP, Snappy
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void write1MRowsUncompressedWithoutChecksums() throws IOException {
pageChecksumDataGenerator.generateData(file_1M_NOCHECKSUMS_UNCOMPRESSED, ONE_MILLION, false, UNCOMPRESSED);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void write1MRowsUncompressedWithChecksums() throws IOException {
pageChecksumDataGenerator.generateData(file_1M_CHECKSUMS_UNCOMPRESSED, ONE_MILLION, true, UNCOMPRESSED);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void write1MRowsGzipWithoutChecksums() throws IOException {
pageChecksumDataGenerator.generateData(file_1M_NOCHECKSUMS_GZIP, ONE_MILLION, false, GZIP);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void write1MRowsGzipWithChecksums() throws IOException {
pageChecksumDataGenerator.generateData(file_1M_CHECKSUMS_GZIP, ONE_MILLION, true, GZIP);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void write1MRowsSnappyWithoutChecksums() throws IOException {
pageChecksumDataGenerator.generateData(file_1M_NOCHECKSUMS_SNAPPY, ONE_MILLION, false, SNAPPY);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void write1MRowsSnappyWithChecksums() throws IOException {
pageChecksumDataGenerator.generateData(file_1M_CHECKSUMS_SNAPPY, ONE_MILLION, true, SNAPPY);
}
// 10M rows, uncompressed, GZIP, Snappy
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void write10MRowsUncompressedWithoutChecksums() throws IOException {
pageChecksumDataGenerator.generateData(file_10M_NOCHECKSUMS_UNCOMPRESSED, 10 * ONE_MILLION, false, UNCOMPRESSED);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void write10MRowsUncompressedWithChecksums() throws IOException {
pageChecksumDataGenerator.generateData(file_10M_CHECKSUMS_UNCOMPRESSED, 10 * ONE_MILLION, true, UNCOMPRESSED);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void write10MRowsGzipWithoutChecksums() throws IOException {
pageChecksumDataGenerator.generateData(file_10M_NOCHECKSUMS_GZIP, 10 * ONE_MILLION, false, GZIP);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void write10MRowsGzipWithChecksums() throws IOException {
pageChecksumDataGenerator.generateData(file_10M_CHECKSUMS_GZIP, 10 * ONE_MILLION, true, GZIP);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void write10MRowsSnappyWithoutChecksums() throws IOException {
pageChecksumDataGenerator.generateData(file_10M_NOCHECKSUMS_SNAPPY, 10 * ONE_MILLION, false, SNAPPY);
}
- @Benchmark @BenchmarkMode(Mode.SingleShotTime)
+ @Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void write10MRowsSnappyWithChecksums() throws IOException {
pageChecksumDataGenerator.generateData(file_10M_CHECKSUMS_SNAPPY, 10 * ONE_MILLION, true, SNAPPY);
}
diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ReadBenchmarks.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ReadBenchmarks.java
index dba5544..e74204a 100644
--- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ReadBenchmarks.java
+++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ReadBenchmarks.java
@@ -20,6 +20,13 @@ package org.apache.parquet.benchmarks;
import org.apache.hadoop.fs.Path;
import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
import org.openjdk.jmh.infra.Blackhole;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetReader;
@@ -29,7 +36,9 @@ import static org.apache.parquet.benchmarks.BenchmarkFiles.*;
import java.io.IOException;
+@State(Scope.Benchmark)
public class ReadBenchmarks {
+
private void read(Path parquetFile, int nRows, Blackhole blackhole) throws IOException
{
ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), parquetFile).withConf(configuration).build();
@@ -47,7 +56,17 @@ public class ReadBenchmarks {
reader.close();
}
+ /**
+ * This needs to be done exactly once. To avoid needlessly regenerating the files for reading, they aren't cleaned
+ * as part of the benchmark. If the files exist, a message will be printed and they will not be regenerated.
+ */
+ @Setup(Level.Trial)
+ public void generateFilesForRead() {
+ new DataGenerator().generateAll();
+ }
+
@Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void read1MRowsDefaultBlockAndPageSizeUncompressed(Blackhole blackhole)
throws IOException
{
@@ -55,6 +74,7 @@ public class ReadBenchmarks {
}
@Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void read1MRowsBS256MPS4MUncompressed(Blackhole blackhole)
throws IOException
{
@@ -62,6 +82,7 @@ public class ReadBenchmarks {
}
@Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void read1MRowsBS256MPS8MUncompressed(Blackhole blackhole)
throws IOException
{
@@ -69,6 +90,7 @@ public class ReadBenchmarks {
}
@Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void read1MRowsBS512MPS4MUncompressed(Blackhole blackhole)
throws IOException
{
@@ -76,6 +98,7 @@ public class ReadBenchmarks {
}
@Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void read1MRowsBS512MPS8MUncompressed(Blackhole blackhole)
throws IOException
{
@@ -91,6 +114,7 @@ public class ReadBenchmarks {
// }
@Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void read1MRowsDefaultBlockAndPageSizeSNAPPY(Blackhole blackhole)
throws IOException
{
@@ -98,6 +122,7 @@ public class ReadBenchmarks {
}
@Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void read1MRowsDefaultBlockAndPageSizeGZIP(Blackhole blackhole)
throws IOException
{
diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/WriteBenchmarks.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/WriteBenchmarks.java
index 5c26a84..0a2d2c0 100644
--- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/WriteBenchmarks.java
+++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/WriteBenchmarks.java
@@ -19,7 +19,9 @@
package org.apache.parquet.benchmarks;
import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
@@ -39,12 +41,13 @@ public class WriteBenchmarks {
private DataGenerator dataGenerator = new DataGenerator();
@Setup(Level.Iteration)
- public void cleanup() {
+ public void setup() {
//clean existing test data at the beginning of each iteration
dataGenerator.cleanup();
}
@Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void write1MRowsDefaultBlockAndPageSizeUncompressed()
throws IOException
{
@@ -59,6 +62,7 @@ public class WriteBenchmarks {
}
@Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void write1MRowsBS256MPS4MUncompressed()
throws IOException
{
@@ -73,6 +77,7 @@ public class WriteBenchmarks {
}
@Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void write1MRowsBS256MPS8MUncompressed()
throws IOException
{
@@ -87,6 +92,7 @@ public class WriteBenchmarks {
}
@Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void write1MRowsBS512MPS4MUncompressed()
throws IOException
{
@@ -101,6 +107,7 @@ public class WriteBenchmarks {
}
@Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void write1MRowsBS512MPS8MUncompressed()
throws IOException
{
@@ -130,6 +137,7 @@ public class WriteBenchmarks {
// }
@Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void write1MRowsDefaultBlockAndPageSizeSNAPPY()
throws IOException
{
@@ -144,6 +152,7 @@ public class WriteBenchmarks {
}
@Benchmark
+ @BenchmarkMode(Mode.SingleShotTime)
public void write1MRowsDefaultBlockAndPageSizeGZIP()
throws IOException
{
diff --git a/parquet-benchmarks/run_checksums.sh b/parquet-benchmarks/src/main/resources/log4j.properties
old mode 100755
new mode 100644
similarity index 68%
rename from parquet-benchmarks/run_checksums.sh
rename to parquet-benchmarks/src/main/resources/log4j.properties
index e798488..f4737c8
--- a/parquet-benchmarks/run_checksums.sh
+++ b/parquet-benchmarks/src/main/resources/log4j.properties
@@ -1,4 +1,3 @@
-#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
@@ -17,12 +16,9 @@
# under the License.
#
-# !/usr/bin/env bash
-
-SCRIPT_PATH=$( cd "$(dirname "$0")" ; pwd -P )
+log4j.rootLogger=INFO, stdout
-echo "Page level CRC checksum benchmarks"
-echo "Running write benchmarks"
-java -jar ${SCRIPT_PATH}/target/parquet-benchmarks.jar p*PageChecksumWriteBenchmarks -bm ss "$@"
-echo "Running read benchmarks"
-java -jar ${SCRIPT_PATH}/target/parquet-benchmarks.jar p*PageChecksumReadBenchmarks -bm ss "$@"
+log4j.appender.stdout=org.apache.log4j.ConsoleAppender
+log4j.appender.stdout.Target=System.out
+log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
+log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p :: %m [%C]%n