You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by do...@apache.org on 2021/08/14 22:24:09 UTC
[orc] branch branch-1.7 updated: ORC-927: Extracting duplicate
codes for RowFilterBenchmark (#841)
This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-1.7
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/branch-1.7 by this push:
new 4052587 ORC-927: Extracting duplicate codes for RowFilterBenchmark (#841)
4052587 is described below
commit 405258771330b50a3857e2795d7e93d8719c85fa
Author: guiyanakaung <gu...@gmail.com>
AuthorDate: Tue Aug 10 12:03:03 2021 +0800
ORC-927: Extracting duplicate codes for RowFilterBenchmark (#841)
### What changes were proposed in this pull request?
Create a new class, `RowFilterInputState`, to hold the code that was duplicated across the RowFilter benchmarks.
`filterValues`, `generateRandomSet` and `customIntRowFilter` previously existed in different internal classes as static variables/methods, so they did not affect each other. After refactoring, they were changed to instance variables/methods of the shared class to keep avoiding interaction between different benchmarks.
### Why are the changes needed?
Easier to maintain.
### How was this patch tested?
Pass the CIs.
(cherry picked from commit 33fcc2637393c2088860106a22f70532b7562be6)
Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
java/bench/hive/src/findbugs/exclude.xml | 2 +-
.../hive/rowfilter/BooleanRowFilterBenchmark.java | 83 +--------------
.../hive/rowfilter/DecimalRowFilterBenchmark.java | 83 +--------------
.../hive/rowfilter/DoubleRowFilterBenchmark.java | 82 +--------------
.../bench/hive/rowfilter/RowFilterInputState.java | 116 +++++++++++++++++++++
.../hive/rowfilter/StringRowFilterBenchmark.java | 83 +--------------
.../rowfilter/TimestampRowFilterBenchmark.java | 80 +-------------
7 files changed, 142 insertions(+), 387 deletions(-)
diff --git a/java/bench/hive/src/findbugs/exclude.xml b/java/bench/hive/src/findbugs/exclude.xml
index 64f2e31..6fec73c 100644
--- a/java/bench/hive/src/findbugs/exclude.xml
+++ b/java/bench/hive/src/findbugs/exclude.xml
@@ -14,7 +14,7 @@
-->
<FindBugsFilter>
<Match>
- <Bug pattern="EI_EXPOSE_REP,EI_EXPOSE_REP2,MS_EXPOSE_REP"/>
+ <Bug pattern="EI_EXPOSE_REP,EI_EXPOSE_REP2,MS_EXPOSE_REP,DM_EXIT"/>
</Match>
<Match>
<Class name="~org\.openjdk\.jmh\.infra\.generated.*"/>
diff --git a/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/BooleanRowFilterBenchmark.java b/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/BooleanRowFilterBenchmark.java
index c360e45..606d5c5 100644
--- a/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/BooleanRowFilterBenchmark.java
+++ b/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/BooleanRowFilterBenchmark.java
@@ -17,16 +17,8 @@
*/
package org.apache.orc.bench.hive.rowfilter;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.orc.OrcFile;
-import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;
-import org.apache.orc.bench.core.Utilities;
-import org.apache.orc.OrcFilterContext;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
@@ -35,7 +27,6 @@ import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
-import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;
@@ -43,8 +34,6 @@ import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.OptionsBuilder;
-import java.io.IOException;
-import java.util.Random;
import java.util.concurrent.TimeUnit;
@State(Scope.Benchmark)
@@ -55,10 +44,8 @@ import java.util.concurrent.TimeUnit;
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public class BooleanRowFilterBenchmark extends org.openjdk.jmh.Main {
- private static final Path root = new Path(System.getProperty("user.dir"));
-
@State(Scope.Thread)
- public static class InputState {
+ public static class InputState extends RowFilterInputState {
@Param({"ORIGINAL"})
public TypeDescription.RowBatchVersion version;
@@ -72,79 +59,19 @@ public class BooleanRowFilterBenchmark extends org.openjdk.jmh.Main {
@Param({"2"})
public int filterColsNum;
- Configuration conf = new Configuration();
- FileSystem fs;
- TypeDescription schema;
- VectorizedRowBatch batch;
- Path path;
- boolean[] include;
- Reader reader;
- Reader.Options readerOptions;
- String filter_column = "sales_id";
+ String dataRelativePath = "data/generated/sales/orc.none";
- @Setup
- public void setup() throws IOException {
- fs = FileSystem.getLocal(conf).getRaw();
- path = new Path(root, "data/generated/sales/orc.none");
- schema = Utilities.loadSchema("sales.schema");
- batch = schema.createRowBatch(version, 1024);
- include = new boolean[schema.getMaximumId() + 1];
- for(TypeDescription child: schema.getChildren()) {
- if (schema.getFieldNames().get(child.getId()-1).compareTo(filter_column) == 0) {
- System.out.println("Apply Filter on column: " + schema.getFieldNames().get(child.getId()-1));
- include[child.getId()] = true;
- } else if (child.getCategory() == benchType) {
- System.out.println("Skip column(s): " + schema.getFieldNames().get(child.getId()-1));
- include[child.getId()] = true;
- if (--filterColsNum == 0) break;
- }
- }
- if (filterColsNum != 0) {
- System.err.println("Dataset does not contain type: "+ benchType);
- System.exit(-1);
- }
- generateRandomSet(Double.parseDouble(filterPerc));
- reader = OrcFile.createReader(path,
- OrcFile.readerOptions(conf).filesystem(fs));
- // just read the Boolean columns
- readerOptions = reader.options().include(include);
- }
+ String schemaName = "sales.schema";
- static boolean[] filterValues = null;
- public static boolean[] generateRandomSet(double percentage) throws IllegalArgumentException {
- if (percentage > 1.0) {
- throw new IllegalArgumentException("Filter percentage must be < 1.0 but was "+ percentage);
- }
- filterValues = new boolean[1024];
- int count = 0;
- while (count < (1024 * percentage)) {
- Random randomGenerator = new Random();
- int randVal = randomGenerator.nextInt(1024);
- if (filterValues[randVal] == false) {
- filterValues[randVal] = true;
- count++;
- }
- }
- return filterValues;
- }
+ String filterColumn = "sales_id";
- public static void customIntRowFilter(OrcFilterContext batch) {
- int newSize = 0;
- for (int row = 0; row < batch.getSelectedSize(); ++row) {
- if (filterValues[row]) {
- batch.getSelected()[newSize++] = row;
- }
- }
- batch.setSelectedInUse(true);
- batch.setSelectedSize(newSize);
- }
}
@Benchmark
public void readOrcRowFilter(Blackhole blackhole, InputState state) throws Exception {
RecordReader rows =
state.reader.rows(state.readerOptions
- .setRowFilter(new String[]{state.filter_column}, InputState::customIntRowFilter));
+ .setRowFilter(new String[]{state.filterColumn}, state::customIntRowFilter));
while (rows.nextBatch(state.batch)) {
blackhole.consume(state.batch);
}
diff --git a/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/DecimalRowFilterBenchmark.java b/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/DecimalRowFilterBenchmark.java
index 526f333..59fddf6 100644
--- a/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/DecimalRowFilterBenchmark.java
+++ b/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/DecimalRowFilterBenchmark.java
@@ -17,16 +17,8 @@
*/
package org.apache.orc.bench.hive.rowfilter;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.orc.OrcFile;
-import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;
-import org.apache.orc.bench.core.Utilities;
-import org.apache.orc.OrcFilterContext;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
@@ -35,7 +27,6 @@ import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
-import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;
@@ -43,8 +34,6 @@ import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.OptionsBuilder;
-import java.io.IOException;
-import java.util.Random;
import java.util.concurrent.TimeUnit;
@State(Scope.Benchmark)
@@ -55,10 +44,8 @@ import java.util.concurrent.TimeUnit;
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public class DecimalRowFilterBenchmark extends org.openjdk.jmh.Main {
- private static final Path root = new Path(System.getProperty("user.dir"));
-
@State(Scope.Thread)
- public static class InputState {
+ public static class InputState extends RowFilterInputState {
// try both DecimalColumnVector and Decimal64
@Param({"ORIGINAL", "USE_DECIMAL64"})
@@ -73,79 +60,19 @@ public class DecimalRowFilterBenchmark extends org.openjdk.jmh.Main {
@Param({"2"})
public int filterColsNum;
- Configuration conf = new Configuration();
- FileSystem fs;
- TypeDescription schema;
- VectorizedRowBatch batch;
- Path path;
- boolean[] include;
- Reader reader;
- Reader.Options readerOptions;
- String filter_column = "vendor_id";
+ String dataRelativePath = "data/generated/taxi/orc.none";
- @Setup
- public void setup() throws IOException {
- fs = FileSystem.getLocal(conf).getRaw();
- path = new Path(root, "data/generated/taxi/orc.none");
- schema = Utilities.loadSchema("taxi.schema");
- batch = schema.createRowBatch(version, 1024);
- include = new boolean[schema.getMaximumId() + 1];
- for(TypeDescription child: schema.getChildren()) {
- if (schema.getFieldNames().get(child.getId()-1).compareTo(filter_column) == 0) {
- System.out.println("Apply Filter on column: " + schema.getFieldNames().get(child.getId()-1));
- include[child.getId()] = true;
- } else if (child.getCategory() == benchType) {
- System.out.println("Skip column(s): " + schema.getFieldNames().get(child.getId()-1));
- include[child.getId()] = true;
- if (--filterColsNum == 0) break;
- }
- }
- if (filterColsNum != 0) {
- System.err.println("Dataset does not contain type: "+ benchType);
- System.exit(-1);
- }
- generateRandomSet(Double.parseDouble(filterPerc));
- reader = OrcFile.createReader(path,
- OrcFile.readerOptions(conf).filesystem(fs));
- // just read the Decimal columns
- readerOptions = reader.options().include(include);
- }
+ String schemaName = "taxi.schema";
- static boolean[] filterValues = null;
- public static boolean[] generateRandomSet(double percentage) throws IllegalArgumentException {
- if (percentage > 1.0) {
- throw new IllegalArgumentException("Filter percentage must be < 1.0 but was "+ percentage);
- }
- filterValues = new boolean[1024];
- int count = 0;
- while (count < (1024 * percentage)) {
- Random randomGenerator = new Random();
- int randVal = randomGenerator.nextInt(1024);
- if (filterValues[randVal] == false) {
- filterValues[randVal] = true;
- count++;
- }
- }
- return filterValues;
- }
+ String filterColumn = "vendor_id";
- public static void customIntRowFilter(OrcFilterContext batch) {
- int newSize = 0;
- for (int row = 0; row < batch.getSelectedSize(); ++row) {
- if (filterValues[row]) {
- batch.getSelected()[newSize++] = row;
- }
- }
- batch.setSelectedInUse(true);
- batch.setSelectedSize(newSize);
- }
}
@Benchmark
public void readOrcRowFilter(Blackhole blackhole, InputState state) throws Exception {
RecordReader rows =
state.reader.rows(state.readerOptions
- .setRowFilter(new String[]{state.filter_column}, InputState::customIntRowFilter));
+ .setRowFilter(new String[]{state.filterColumn}, state::customIntRowFilter));
while (rows.nextBatch(state.batch)) {
blackhole.consume(state.batch);
}
diff --git a/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/DoubleRowFilterBenchmark.java b/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/DoubleRowFilterBenchmark.java
index d6b9a4d..5ce87b8 100644
--- a/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/DoubleRowFilterBenchmark.java
+++ b/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/DoubleRowFilterBenchmark.java
@@ -17,16 +17,8 @@
*/
package org.apache.orc.bench.hive.rowfilter;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.orc.OrcFile;
-import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;
-import org.apache.orc.bench.core.Utilities;
-import org.apache.orc.OrcFilterContext;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
@@ -35,7 +27,6 @@ import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
-import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;
@@ -43,8 +34,6 @@ import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.OptionsBuilder;
-import java.io.IOException;
-import java.util.Random;
import java.util.concurrent.TimeUnit;
@State(Scope.Benchmark)
@@ -54,10 +43,9 @@ import java.util.concurrent.TimeUnit;
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public class DoubleRowFilterBenchmark extends org.openjdk.jmh.Main {
- private static final Path root = new Path(System.getProperty("user.dir"));
@State(Scope.Thread)
- public static class InputState {
+ public static class InputState extends RowFilterInputState {
// try both DecimalColumnVector and Decimal64
@Param({"ORIGINAL"})
@@ -72,79 +60,19 @@ public class DoubleRowFilterBenchmark extends org.openjdk.jmh.Main {
@Param({"2"})
public int filterColsNum;
- Configuration conf = new Configuration();
- FileSystem fs;
- TypeDescription schema;
- VectorizedRowBatch batch;
- Path path;
- boolean[] include;
- Reader reader;
- Reader.Options readerOptions;
- String filter_column = "vendor_id";
+ String dataRelativePath = "data/generated/taxi/orc.none";
- @Setup
- public void setup() throws IOException {
- fs = FileSystem.getLocal(conf).getRaw();
- path = new Path(root, "data/generated/taxi/orc.none");
- schema = Utilities.loadSchema("taxi.schema");
- batch = schema.createRowBatch(version, 1024);
- include = new boolean[schema.getMaximumId() + 1];
- for(TypeDescription child: schema.getChildren()) {
- if (schema.getFieldNames().get(child.getId()-1).compareTo(filter_column) == 0) {
- System.out.println("Apply Filter on column: " + schema.getFieldNames().get(child.getId()-1));
- include[child.getId()] = true;
- } else if (child.getCategory() == benchType) {
- System.out.println("Skip column(s): " + schema.getFieldNames().get(child.getId()-1));
- include[child.getId()] = true;
- if (--filterColsNum == 0) break;
- }
- }
- if (filterColsNum != 0) {
- System.err.println("Dataset does not contain type: "+ benchType);
- System.exit(-1);
- }
- generateRandomSet(Double.parseDouble(filterPerc));
- reader = OrcFile.createReader(path,
- OrcFile.readerOptions(conf).filesystem(fs));
- // just read the Double columns
- readerOptions = reader.options().include(include);
- }
+ String schemaName = "taxi.schema";
- static boolean[] filterValues = null;
- public static boolean[] generateRandomSet(double percentage) throws IllegalArgumentException {
- if (percentage > 1.0) {
- throw new IllegalArgumentException("Filter percentage must be < 1.0 but was "+ percentage);
- }
- filterValues = new boolean[1024];
- int count = 0;
- while (count < (1024 * percentage)) {
- Random randomGenerator = new Random();
- int randVal = randomGenerator.nextInt(1024);
- if (filterValues[randVal] == false) {
- filterValues[randVal] = true;
- count++;
- }
- }
- return filterValues;
- }
+ String filterColumn = "vendor_id";
- public static void customIntRowFilter(OrcFilterContext batch) {
- int newSize = 0;
- for (int row = 0; row < batch.getSelectedSize(); ++row) {
- if (filterValues[row]) {
- batch.getSelected()[newSize++] = row;
- }
- }
- batch.setSelectedInUse(true);
- batch.setSelectedSize(newSize);
- }
}
@Benchmark
public void readOrcRowFilter(Blackhole blackhole, InputState state) throws Exception {
RecordReader rows =
state.reader.rows(state.readerOptions
- .setRowFilter(new String[]{state.filter_column}, InputState::customIntRowFilter));
+ .setRowFilter(new String[]{state.filterColumn}, state::customIntRowFilter));
while (rows.nextBatch(state.batch)) {
blackhole.consume(state.batch);
}
diff --git a/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/RowFilterInputState.java b/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/RowFilterInputState.java
new file mode 100644
index 0000000..dba28f8
--- /dev/null
+++ b/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/RowFilterInputState.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.bench.hive.rowfilter;
+
+import org.apache.commons.lang.reflect.FieldUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.OrcFile;
+import org.apache.orc.OrcFilterContext;
+import org.apache.orc.Reader;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.bench.core.Utilities;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+
+import java.io.IOException;
+import java.util.Random;
+
+@State(Scope.Thread)
+public abstract class RowFilterInputState {
+
+ private static final Path root = new Path(System.getProperty("user.dir"));
+
+ Configuration conf = new Configuration();
+ FileSystem fs;
+ TypeDescription schema;
+ VectorizedRowBatch batch;
+ Path path;
+ boolean[] include;
+ Reader reader;
+ Reader.Options readerOptions;
+ boolean[] filterValues = null;
+
+ @Setup
+ public void setup() throws IOException, IllegalAccessException {
+ TypeDescription.RowBatchVersion version =
+ (TypeDescription.RowBatchVersion) FieldUtils.readField(this, "version", true);
+ TypeDescription.Category benchType = (TypeDescription.Category) FieldUtils.readField(this, "benchType", true);
+ String filterPerc = (String) FieldUtils.readField(this, "filterPerc", true);
+ int filterColsNum = (int) FieldUtils.readField(this, "filterColsNum", true);
+ String dataRelativePath = (String) FieldUtils.readField(this, "dataRelativePath", true);
+ String schemaName = (String) FieldUtils.readField(this, "schemaName", true);
+ String filterColumn = (String) FieldUtils.readField(this, "filterColumn", true);
+
+ fs = FileSystem.getLocal(conf).getRaw();
+ path = new Path(root, dataRelativePath);
+ schema = Utilities.loadSchema(schemaName);
+ batch = schema.createRowBatch(version, 1024);
+ include = new boolean[schema.getMaximumId() + 1];
+ for (TypeDescription child : schema.getChildren()) {
+ if (schema.getFieldNames().get(child.getId() - 1).compareTo(filterColumn) == 0) {
+ System.out.println("Apply Filter on column: " + schema.getFieldNames().get(child.getId() - 1));
+ include[child.getId()] = true;
+ } else if (child.getCategory() == benchType) {
+ System.out.println("Skip column(s): " + schema.getFieldNames().get(child.getId() - 1));
+ include[child.getId()] = true;
+ if (--filterColsNum == 0) break;
+ }
+ }
+ if (filterColsNum != 0) {
+ System.err.println("Dataset does not contain type: " + benchType);
+ System.exit(-1);
+ }
+ generateRandomSet(Double.parseDouble(filterPerc));
+ reader = OrcFile.createReader(path,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ // just read the filter column and the selected benchType columns
+ readerOptions = reader.options().include(include);
+ }
+
+ public void generateRandomSet(double percentage) throws IllegalArgumentException {
+ if (percentage > 1.0) {
+ throw new IllegalArgumentException("Filter percentage must be < 1.0 but was " + percentage);
+ }
+ filterValues = new boolean[1024];
+ int count = 0;
+ while (count < (1024 * percentage)) {
+ Random randomGenerator = new Random();
+ int randVal = randomGenerator.nextInt(1024);
+ if (!filterValues[randVal]) {
+ filterValues[randVal] = true;
+ count++;
+ }
+ }
+ }
+
+ public void customIntRowFilter(OrcFilterContext batch) {
+ int newSize = 0;
+ for (int row = 0; row < batch.getSelectedSize(); ++row) {
+ if (filterValues[row]) {
+ batch.getSelected()[newSize++] = row;
+ }
+ }
+ batch.setSelectedInUse(true);
+ batch.setSelectedSize(newSize);
+ }
+
+}
diff --git a/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/StringRowFilterBenchmark.java b/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/StringRowFilterBenchmark.java
index d2f8f48..bc12fbe 100644
--- a/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/StringRowFilterBenchmark.java
+++ b/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/StringRowFilterBenchmark.java
@@ -17,16 +17,8 @@
*/
package org.apache.orc.bench.hive.rowfilter;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.orc.OrcFile;
-import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;
-import org.apache.orc.bench.core.Utilities;
-import org.apache.orc.OrcFilterContext;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
@@ -35,7 +27,6 @@ import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
-import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;
@@ -43,8 +34,6 @@ import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.OptionsBuilder;
-import java.io.IOException;
-import java.util.Random;
import java.util.concurrent.TimeUnit;
@State(Scope.Benchmark)
@@ -55,10 +44,8 @@ import java.util.concurrent.TimeUnit;
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public class StringRowFilterBenchmark extends org.openjdk.jmh.Main {
- private static final Path root = new Path(System.getProperty("user.dir"));
-
@State(Scope.Thread)
- public static class InputState {
+ public static class InputState extends RowFilterInputState {
@Param({"ORIGINAL"})
public TypeDescription.RowBatchVersion version;
@@ -72,79 +59,19 @@ public class StringRowFilterBenchmark extends org.openjdk.jmh.Main {
@Param({"2"})
public int filterColsNum;
- Configuration conf = new Configuration();
- FileSystem fs;
- TypeDescription schema;
- VectorizedRowBatch batch;
- Path path;
- boolean[] include;
- Reader reader;
- Reader.Options readerOptions;
- String filter_column = "sales_id";
+ String dataRelativePath = "data/generated/sales/orc.none";
- @Setup
- public void setup() throws IOException {
- fs = FileSystem.getLocal(conf).getRaw();
- path = new Path(root, "data/generated/sales/orc.none");
- schema = Utilities.loadSchema("sales.schema");
- batch = schema.createRowBatch(version, 1024);
- include = new boolean[schema.getMaximumId() + 1];
- for(TypeDescription child: schema.getChildren()) {
- if (schema.getFieldNames().get(child.getId()-1).compareTo(filter_column) == 0) {
- System.out.println("Apply Filter on column: " + schema.getFieldNames().get(child.getId()-1));
- include[child.getId()] = true;
- } else if (child.getCategory() == benchType) {
- System.out.println("Skip column(s): " + schema.getFieldNames().get(child.getId()-1));
- include[child.getId()] = true;
- if (--filterColsNum == 0) break;
- }
- }
- if (filterColsNum != 0) {
- System.err.println("Dataset does not contain type: "+ benchType);
- System.exit(-1);
- }
- generateRandomSet(Double.parseDouble(filterPerc));
- reader = OrcFile.createReader(path,
- OrcFile.readerOptions(conf).filesystem(fs));
- // just read the String columns
- readerOptions = reader.options().include(include);
- }
+ String schemaName = "sales.schema";
- static boolean[] filterValues = null;
- public static boolean[] generateRandomSet(double percentage) throws IllegalArgumentException {
- if (percentage > 1.0) {
- throw new IllegalArgumentException("Filter percentage must be < 1.0 but was "+ percentage);
- }
- filterValues = new boolean[1024];
- int count = 0;
- while (count < (1024 * percentage)) {
- Random randomGenerator = new Random();
- int randVal = randomGenerator.nextInt(1024);
- if (filterValues[randVal] == false) {
- filterValues[randVal] = true;
- count++;
- }
- }
- return filterValues;
- }
+ String filterColumn = "sales_id";
- public static void customIntRowFilter(OrcFilterContext batch) {
- int newSize = 0;
- for (int row = 0; row < batch.getSelectedSize(); ++row) {
- if (filterValues[row]) {
- batch.getSelected()[newSize++] = row;
- }
- }
- batch.setSelectedInUse(true);
- batch.setSelectedSize(newSize);
- }
}
@Benchmark
public void readOrcRowFilter(Blackhole blackhole, InputState state) throws Exception {
RecordReader rows =
state.reader.rows(state.readerOptions
- .setRowFilter(new String[]{state.filter_column}, InputState::customIntRowFilter));
+ .setRowFilter(new String[]{state.filterColumn}, state::customIntRowFilter));
while (rows.nextBatch(state.batch)) {
blackhole.consume(state.batch);
}
diff --git a/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/TimestampRowFilterBenchmark.java b/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/TimestampRowFilterBenchmark.java
index c4994eb..a8aa6ba 100644
--- a/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/TimestampRowFilterBenchmark.java
+++ b/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/TimestampRowFilterBenchmark.java
@@ -17,16 +17,9 @@
*/
package org.apache.orc.bench.hive.rowfilter;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.orc.OrcFile;
-import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;
-import org.apache.orc.bench.core.Utilities;
-import org.apache.orc.OrcFilterContext;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
@@ -35,7 +28,6 @@ import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
-import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;
@@ -43,8 +35,6 @@ import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.OptionsBuilder;
-import java.io.IOException;
-import java.util.Random;
import java.util.concurrent.TimeUnit;
@State(Scope.Benchmark)
@@ -57,7 +47,7 @@ public class TimestampRowFilterBenchmark extends org.openjdk.jmh.Main {
private static final Path root = new Path(System.getProperty("user.dir"));
@State(Scope.Thread)
- public static class InputState {
+ public static class InputState extends RowFilterInputState {
// try both DecimalColumnVector and Decimal64
@Param({"ORIGINAL"})
@@ -72,79 +62,19 @@ public class TimestampRowFilterBenchmark extends org.openjdk.jmh.Main {
@Param({"2"})
public int filterColsNum;
- Configuration conf = new Configuration();
- FileSystem fs;
- TypeDescription schema;
- VectorizedRowBatch batch;
- Path path;
- boolean[] include;
- Reader reader;
- Reader.Options readerOptions;
- String filter_column = "vendor_id";
+ String dataRelativePath = "data/generated/taxi/orc.none";
- @Setup
- public void setup() throws IOException {
- fs = FileSystem.getLocal(conf).getRaw();
- path = new Path(root, "data/generated/taxi/orc.none");
- schema = Utilities.loadSchema("taxi.schema");
- batch = schema.createRowBatch(version, 1024);
- include = new boolean[schema.getMaximumId() + 1];
- for(TypeDescription child: schema.getChildren()) {
- if (schema.getFieldNames().get(child.getId()-1).compareTo(filter_column) == 0) {
- System.out.println("Apply Filter on column: " + schema.getFieldNames().get(child.getId()-1));
- include[child.getId()] = true;
- } else if (child.getCategory() == benchType) {
- System.out.println("Skip column(s): " + schema.getFieldNames().get(child.getId()-1));
- include[child.getId()] = true;
- if (--filterColsNum == 0) break;
- }
- }
- if (filterColsNum != 0) {
- System.err.println("Dataset does not contain type: "+ benchType);
- System.exit(-1);
- }
- generateRandomSet(Double.parseDouble(filterPerc));
- reader = OrcFile.createReader(path,
- OrcFile.readerOptions(conf).filesystem(fs));
- // just read the Timestamp columns
- readerOptions = reader.options().include(include);
- }
+ String schemaName = "taxi.schema";
- static boolean[] filterValues = null;
- public static boolean[] generateRandomSet(double percentage) throws IllegalArgumentException {
- if (percentage > 1.0) {
- throw new IllegalArgumentException("Filter percentage must be < 1.0 but was "+ percentage);
- }
- filterValues = new boolean[1024];
- int count = 0;
- while (count < (1024 * percentage)) {
- Random randomGenerator = new Random();
- int randVal = randomGenerator.nextInt(1024);
- if (filterValues[randVal] == false) {
- filterValues[randVal] = true;
- count++;
- }
- }
- return filterValues;
- }
+ String filterColumn = "vendor_id";
- public static void customIntRowFilter(OrcFilterContext batch) {
- int newSize = 0;
- for (int row = 0; row < batch.getSelectedSize(); ++row) {
- if (filterValues[row]) {
- batch.getSelected()[newSize++] = row;
- }
- }
- batch.setSelectedInUse(true);
- batch.setSelectedSize(newSize);
- }
}
@Benchmark
public void readOrcRowFilter(Blackhole blackhole, InputState state) throws Exception {
RecordReader rows =
state.reader.rows(state.readerOptions
- .setRowFilter(new String[]{state.filter_column}, InputState::customIntRowFilter));
+ .setRowFilter(new String[]{state.filterColumn}, state::customIntRowFilter));
while (rows.nextBatch(state.batch)) {
blackhole.consume(state.batch);
}