Posted to commits@orc.apache.org by do...@apache.org on 2021/08/14 22:24:09 UTC

[orc] branch branch-1.7 updated: ORC-927: Extracting duplicate codes for RowFilterBenchmark (#841)

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-1.7
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/branch-1.7 by this push:
     new 4052587  ORC-927: Extracting duplicate codes for RowFilterBenchmark (#841)
4052587 is described below

commit 405258771330b50a3857e2795d7e93d8719c85fa
Author: guiyanakaung <gu...@gmail.com>
AuthorDate: Tue Aug 10 12:03:03 2021 +0800

    ORC-927: Extracting duplicate codes for RowFilterBenchmark (#841)
    
    ### What changes were proposed in this pull request?
    
    Create a new class, RowFilterInputState, to hold the duplicated code.
    `filterValues`, `generateRandomSet` and `customIntRowFilter` previously existed as static variables/methods in each benchmark's own inner class, so the copies could not affect each other. Once the code moved into a single shared class, static members would have been shared across all benchmarks, so they were changed to instance variables/methods to keep the benchmarks isolated from one another.
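    
    A minimal, self-contained sketch of the resulting pattern (class and field
    names here are illustrative, not the real benchmark API; the actual base
    class reads the subclass fields with `FieldUtils.readField`):
    
    ```java
    import java.lang.reflect.Field;
    import java.util.Random;
    
    // Illustrative stand-in for RowFilterInputState: the filter data is an
    // instance field, so each state object owns its own copy.
    abstract class FilterState {
        boolean[] filterValues;
    
        // The shared setup reads a field that only the subclass declares,
        // mirroring FieldUtils.readField(this, "filterPerc", true).
        void setup() throws ReflectiveOperationException {
            Field f = getClass().getDeclaredField("filterPerc");
            f.setAccessible(true);
            double perc = Double.parseDouble((String) f.get(this));
            filterValues = new boolean[1024];
            Random random = new Random();
            int count = 0;
            while (count < 1024 * perc) {
                int idx = random.nextInt(1024);
                if (!filterValues[idx]) {
                    filterValues[idx] = true;
                    count++;
                }
            }
        }
    
        // Instance method, so a bound reference (state::filter) uses this
        // object's filterValues instead of one shared static array.
        int filter(int selectedSize, int[] selected) {
            int newSize = 0;
            for (int row = 0; row < selectedSize; ++row) {
                if (filterValues[row]) {
                    selected[newSize++] = row;
                }
            }
            return newSize;
        }
    }
    
    // A benchmark state now only declares its parameters and constants.
    class SalesState extends FilterState {
        String filterPerc = "0.05"; // a JMH @Param in the real benchmark
    }
    
    public class RowFilterSketch {
        public static void main(String[] args) throws ReflectiveOperationException {
            SalesState state = new SalesState();
            state.setup();
            int[] selected = new int[1024];
            System.out.println("rows kept: " + state.filter(1024, selected));
        }
    }
    ```
    
    With instance members, the `state::customIntRowFilter` reference passed to
    `setRowFilter` is bound to one state object, whereas the old
    `InputState::customIntRowFilter` reference read a static array.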
    
    ### Why are the changes needed?
    
    The setup and filtering logic now lives in one place, which makes it easier to maintain.
    
    ### How was this patch tested?
    
    Pass the CIs.
    
    (cherry picked from commit 33fcc2637393c2088860106a22f70532b7562be6)
    Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
 java/bench/hive/src/findbugs/exclude.xml           |   2 +-
 .../hive/rowfilter/BooleanRowFilterBenchmark.java  |  83 +--------------
 .../hive/rowfilter/DecimalRowFilterBenchmark.java  |  83 +--------------
 .../hive/rowfilter/DoubleRowFilterBenchmark.java   |  82 +--------------
 .../bench/hive/rowfilter/RowFilterInputState.java  | 116 +++++++++++++++++++++
 .../hive/rowfilter/StringRowFilterBenchmark.java   |  83 +--------------
 .../rowfilter/TimestampRowFilterBenchmark.java     |  80 +-------------
 7 files changed, 142 insertions(+), 387 deletions(-)

diff --git a/java/bench/hive/src/findbugs/exclude.xml b/java/bench/hive/src/findbugs/exclude.xml
index 64f2e31..6fec73c 100644
--- a/java/bench/hive/src/findbugs/exclude.xml
+++ b/java/bench/hive/src/findbugs/exclude.xml
@@ -14,7 +14,7 @@
 -->
 <FindBugsFilter>
   <Match>
-    <Bug pattern="EI_EXPOSE_REP,EI_EXPOSE_REP2,MS_EXPOSE_REP"/>
+    <Bug pattern="EI_EXPOSE_REP,EI_EXPOSE_REP2,MS_EXPOSE_REP,DM_EXIT"/>
   </Match>
   <Match>
     <Class name="~org\.openjdk\.jmh\.infra\.generated.*"/>
diff --git a/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/BooleanRowFilterBenchmark.java b/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/BooleanRowFilterBenchmark.java
index c360e45..606d5c5 100644
--- a/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/BooleanRowFilterBenchmark.java
+++ b/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/BooleanRowFilterBenchmark.java
@@ -17,16 +17,8 @@
  */
 package org.apache.orc.bench.hive.rowfilter;
 
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.orc.OrcFile;
-import org.apache.orc.Reader;
 import org.apache.orc.RecordReader;
 import org.apache.orc.TypeDescription;
-import org.apache.orc.bench.core.Utilities;
-import org.apache.orc.OrcFilterContext;
 import org.openjdk.jmh.annotations.Benchmark;
 import org.openjdk.jmh.annotations.BenchmarkMode;
 import org.openjdk.jmh.annotations.Fork;
@@ -35,7 +27,6 @@ import org.openjdk.jmh.annotations.Mode;
 import org.openjdk.jmh.annotations.OutputTimeUnit;
 import org.openjdk.jmh.annotations.Param;
 import org.openjdk.jmh.annotations.Scope;
-import org.openjdk.jmh.annotations.Setup;
 import org.openjdk.jmh.annotations.State;
 import org.openjdk.jmh.annotations.Warmup;
 import org.openjdk.jmh.infra.Blackhole;
@@ -43,8 +34,6 @@ import org.openjdk.jmh.runner.Runner;
 import org.openjdk.jmh.runner.RunnerException;
 import org.openjdk.jmh.runner.options.OptionsBuilder;
 
-import java.io.IOException;
-import java.util.Random;
 import java.util.concurrent.TimeUnit;
 
 @State(Scope.Benchmark)
@@ -55,10 +44,8 @@ import java.util.concurrent.TimeUnit;
 @OutputTimeUnit(TimeUnit.MILLISECONDS)
 public class BooleanRowFilterBenchmark extends org.openjdk.jmh.Main {
 
-  private static final Path root = new Path(System.getProperty("user.dir"));
-
   @State(Scope.Thread)
-  public static class InputState {
+  public static class InputState extends RowFilterInputState {
 
     @Param({"ORIGINAL"})
     public TypeDescription.RowBatchVersion version;
@@ -72,79 +59,19 @@ public class BooleanRowFilterBenchmark extends org.openjdk.jmh.Main {
     @Param({"2"})
     public int filterColsNum;
 
-    Configuration conf = new Configuration();
-    FileSystem fs;
-    TypeDescription schema;
-    VectorizedRowBatch batch;
-    Path path;
-    boolean[] include;
-    Reader reader;
-    Reader.Options readerOptions;
-    String filter_column = "sales_id";
+    String dataRelativePath = "data/generated/sales/orc.none";
 
-    @Setup
-    public void setup() throws IOException {
-      fs = FileSystem.getLocal(conf).getRaw();
-      path = new Path(root, "data/generated/sales/orc.none");
-      schema = Utilities.loadSchema("sales.schema");
-      batch = schema.createRowBatch(version, 1024);
-      include = new boolean[schema.getMaximumId() + 1];
-      for(TypeDescription child: schema.getChildren()) {
-        if (schema.getFieldNames().get(child.getId()-1).compareTo(filter_column) == 0) {
-          System.out.println("Apply Filter on column: " + schema.getFieldNames().get(child.getId()-1));
-          include[child.getId()] = true;
-        } else if (child.getCategory() == benchType) {
-          System.out.println("Skip column(s): " + schema.getFieldNames().get(child.getId()-1));
-          include[child.getId()] = true;
-          if (--filterColsNum == 0) break;
-        }
-      }
-      if (filterColsNum != 0) {
-        System.err.println("Dataset does not contain type: "+ benchType);
-        System.exit(-1);
-      }
-      generateRandomSet(Double.parseDouble(filterPerc));
-      reader = OrcFile.createReader(path,
-          OrcFile.readerOptions(conf).filesystem(fs));
-      // just read the Boolean columns
-      readerOptions = reader.options().include(include);
-    }
+    String schemaName = "sales.schema";
 
-    static boolean[] filterValues = null;
-    public static boolean[] generateRandomSet(double percentage) throws IllegalArgumentException {
-      if (percentage > 1.0) {
-        throw new IllegalArgumentException("Filter percentage must be < 1.0 but was "+ percentage);
-      }
-      filterValues = new boolean[1024];
-      int count = 0;
-      while (count < (1024 * percentage)) {
-        Random randomGenerator = new Random();
-        int randVal = randomGenerator.nextInt(1024);
-        if (filterValues[randVal] == false) {
-          filterValues[randVal] = true;
-          count++;
-        }
-      }
-      return filterValues;
-    }
+    String filterColumn = "sales_id";
 
-    public static void customIntRowFilter(OrcFilterContext batch) {
-      int newSize = 0;
-      for (int row = 0; row < batch.getSelectedSize(); ++row) {
-        if (filterValues[row]) {
-          batch.getSelected()[newSize++] = row;
-        }
-      }
-      batch.setSelectedInUse(true);
-      batch.setSelectedSize(newSize);
-    }
   }
 
   @Benchmark
   public void readOrcRowFilter(Blackhole blackhole, InputState state) throws Exception {
     RecordReader rows =
         state.reader.rows(state.readerOptions
-            .setRowFilter(new String[]{state.filter_column}, InputState::customIntRowFilter));
+            .setRowFilter(new String[]{state.filterColumn}, state::customIntRowFilter));
     while (rows.nextBatch(state.batch)) {
       blackhole.consume(state.batch);
     }
diff --git a/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/DecimalRowFilterBenchmark.java b/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/DecimalRowFilterBenchmark.java
index 526f333..59fddf6 100644
--- a/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/DecimalRowFilterBenchmark.java
+++ b/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/DecimalRowFilterBenchmark.java
@@ -17,16 +17,8 @@
  */
 package org.apache.orc.bench.hive.rowfilter;
 
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.orc.OrcFile;
-import org.apache.orc.Reader;
 import org.apache.orc.RecordReader;
 import org.apache.orc.TypeDescription;
-import org.apache.orc.bench.core.Utilities;
-import org.apache.orc.OrcFilterContext;
 import org.openjdk.jmh.annotations.Benchmark;
 import org.openjdk.jmh.annotations.BenchmarkMode;
 import org.openjdk.jmh.annotations.Fork;
@@ -35,7 +27,6 @@ import org.openjdk.jmh.annotations.Mode;
 import org.openjdk.jmh.annotations.OutputTimeUnit;
 import org.openjdk.jmh.annotations.Param;
 import org.openjdk.jmh.annotations.Scope;
-import org.openjdk.jmh.annotations.Setup;
 import org.openjdk.jmh.annotations.State;
 import org.openjdk.jmh.annotations.Warmup;
 import org.openjdk.jmh.infra.Blackhole;
@@ -43,8 +34,6 @@ import org.openjdk.jmh.runner.Runner;
 import org.openjdk.jmh.runner.RunnerException;
 import org.openjdk.jmh.runner.options.OptionsBuilder;
 
-import java.io.IOException;
-import java.util.Random;
 import java.util.concurrent.TimeUnit;
 
 @State(Scope.Benchmark)
@@ -55,10 +44,8 @@ import java.util.concurrent.TimeUnit;
 @OutputTimeUnit(TimeUnit.MILLISECONDS)
 public class DecimalRowFilterBenchmark extends org.openjdk.jmh.Main {
 
-  private static final Path root = new Path(System.getProperty("user.dir"));
-
   @State(Scope.Thread)
-  public static class InputState {
+  public static class InputState extends RowFilterInputState {
 
     // try both DecimalColumnVector and Decimal64
     @Param({"ORIGINAL", "USE_DECIMAL64"})
@@ -73,79 +60,19 @@ public class DecimalRowFilterBenchmark extends org.openjdk.jmh.Main {
     @Param({"2"})
     public int filterColsNum;
 
-    Configuration conf = new Configuration();
-    FileSystem fs;
-    TypeDescription schema;
-    VectorizedRowBatch batch;
-    Path path;
-    boolean[] include;
-    Reader reader;
-    Reader.Options readerOptions;
-    String filter_column = "vendor_id";
+    String dataRelativePath = "data/generated/taxi/orc.none";
 
-    @Setup
-    public void setup() throws IOException {
-      fs = FileSystem.getLocal(conf).getRaw();
-      path = new Path(root, "data/generated/taxi/orc.none");
-      schema = Utilities.loadSchema("taxi.schema");
-      batch = schema.createRowBatch(version, 1024);
-      include = new boolean[schema.getMaximumId() + 1];
-      for(TypeDescription child: schema.getChildren()) {
-        if (schema.getFieldNames().get(child.getId()-1).compareTo(filter_column) == 0) {
-          System.out.println("Apply Filter on column: " + schema.getFieldNames().get(child.getId()-1));
-          include[child.getId()] = true;
-        } else if (child.getCategory() == benchType) {
-          System.out.println("Skip column(s): " + schema.getFieldNames().get(child.getId()-1));
-          include[child.getId()] = true;
-          if (--filterColsNum == 0) break;
-        }
-      }
-      if (filterColsNum != 0) {
-        System.err.println("Dataset does not contain type: "+ benchType);
-        System.exit(-1);
-      }
-      generateRandomSet(Double.parseDouble(filterPerc));
-      reader = OrcFile.createReader(path,
-          OrcFile.readerOptions(conf).filesystem(fs));
-      // just read the Decimal columns
-      readerOptions = reader.options().include(include);
-    }
+    String schemaName = "taxi.schema";
 
-    static boolean[] filterValues = null;
-    public static boolean[] generateRandomSet(double percentage) throws IllegalArgumentException {
-      if (percentage > 1.0) {
-        throw new IllegalArgumentException("Filter percentage must be < 1.0 but was "+ percentage);
-      }
-      filterValues = new boolean[1024];
-      int count = 0;
-      while (count < (1024 * percentage)) {
-        Random randomGenerator = new Random();
-        int randVal = randomGenerator.nextInt(1024);
-        if (filterValues[randVal] == false) {
-          filterValues[randVal] = true;
-          count++;
-        }
-      }
-      return filterValues;
-    }
+    String filterColumn = "vendor_id";
 
-    public static void customIntRowFilter(OrcFilterContext batch) {
-      int newSize = 0;
-      for (int row = 0; row < batch.getSelectedSize(); ++row) {
-        if (filterValues[row]) {
-          batch.getSelected()[newSize++] = row;
-        }
-      }
-      batch.setSelectedInUse(true);
-      batch.setSelectedSize(newSize);
-    }
   }
 
   @Benchmark
   public void readOrcRowFilter(Blackhole blackhole, InputState state) throws Exception {
     RecordReader rows =
         state.reader.rows(state.readerOptions
-            .setRowFilter(new String[]{state.filter_column}, InputState::customIntRowFilter));
+            .setRowFilter(new String[]{state.filterColumn}, state::customIntRowFilter));
     while (rows.nextBatch(state.batch)) {
       blackhole.consume(state.batch);
     }
diff --git a/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/DoubleRowFilterBenchmark.java b/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/DoubleRowFilterBenchmark.java
index d6b9a4d..5ce87b8 100644
--- a/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/DoubleRowFilterBenchmark.java
+++ b/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/DoubleRowFilterBenchmark.java
@@ -17,16 +17,8 @@
  */
 package org.apache.orc.bench.hive.rowfilter;
 
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.orc.OrcFile;
-import org.apache.orc.Reader;
 import org.apache.orc.RecordReader;
 import org.apache.orc.TypeDescription;
-import org.apache.orc.bench.core.Utilities;
-import org.apache.orc.OrcFilterContext;
 import org.openjdk.jmh.annotations.Benchmark;
 import org.openjdk.jmh.annotations.BenchmarkMode;
 import org.openjdk.jmh.annotations.Fork;
@@ -35,7 +27,6 @@ import org.openjdk.jmh.annotations.Mode;
 import org.openjdk.jmh.annotations.OutputTimeUnit;
 import org.openjdk.jmh.annotations.Param;
 import org.openjdk.jmh.annotations.Scope;
-import org.openjdk.jmh.annotations.Setup;
 import org.openjdk.jmh.annotations.State;
 import org.openjdk.jmh.annotations.Warmup;
 import org.openjdk.jmh.infra.Blackhole;
@@ -43,8 +34,6 @@ import org.openjdk.jmh.runner.Runner;
 import org.openjdk.jmh.runner.RunnerException;
 import org.openjdk.jmh.runner.options.OptionsBuilder;
 
-import java.io.IOException;
-import java.util.Random;
 import java.util.concurrent.TimeUnit;
 
 @State(Scope.Benchmark)
@@ -54,10 +43,9 @@ import java.util.concurrent.TimeUnit;
 @BenchmarkMode(Mode.AverageTime)
 @OutputTimeUnit(TimeUnit.MILLISECONDS)
 public class DoubleRowFilterBenchmark extends org.openjdk.jmh.Main {
-  private static final Path root = new Path(System.getProperty("user.dir"));
 
   @State(Scope.Thread)
-  public static class InputState {
+  public static class InputState extends RowFilterInputState {
 
     // try both DecimalColumnVector and Decimal64
     @Param({"ORIGINAL"})
@@ -72,79 +60,19 @@ public class DoubleRowFilterBenchmark extends org.openjdk.jmh.Main {
     @Param({"2"})
     public int filterColsNum;
 
-    Configuration conf = new Configuration();
-    FileSystem fs;
-    TypeDescription schema;
-    VectorizedRowBatch batch;
-    Path path;
-    boolean[] include;
-    Reader reader;
-    Reader.Options readerOptions;
-    String filter_column = "vendor_id";
+    String dataRelativePath = "data/generated/taxi/orc.none";
 
-    @Setup
-    public void setup() throws IOException {
-      fs = FileSystem.getLocal(conf).getRaw();
-      path = new Path(root, "data/generated/taxi/orc.none");
-      schema = Utilities.loadSchema("taxi.schema");
-      batch = schema.createRowBatch(version, 1024);
-      include = new boolean[schema.getMaximumId() + 1];
-      for(TypeDescription child: schema.getChildren()) {
-        if (schema.getFieldNames().get(child.getId()-1).compareTo(filter_column) == 0) {
-          System.out.println("Apply Filter on column: " + schema.getFieldNames().get(child.getId()-1));
-          include[child.getId()] = true;
-        } else if (child.getCategory() == benchType) {
-          System.out.println("Skip column(s): " + schema.getFieldNames().get(child.getId()-1));
-          include[child.getId()] = true;
-          if (--filterColsNum == 0) break;
-        }
-      }
-      if (filterColsNum != 0) {
-        System.err.println("Dataset does not contain type: "+ benchType);
-        System.exit(-1);
-      }
-      generateRandomSet(Double.parseDouble(filterPerc));
-      reader = OrcFile.createReader(path,
-          OrcFile.readerOptions(conf).filesystem(fs));
-      // just read the Double columns
-      readerOptions = reader.options().include(include);
-    }
+    String schemaName = "taxi.schema";
 
-    static boolean[] filterValues = null;
-    public static boolean[] generateRandomSet(double percentage) throws IllegalArgumentException {
-      if (percentage > 1.0) {
-        throw new IllegalArgumentException("Filter percentage must be < 1.0 but was "+ percentage);
-      }
-      filterValues = new boolean[1024];
-      int count = 0;
-      while (count < (1024 * percentage)) {
-        Random randomGenerator = new Random();
-        int randVal = randomGenerator.nextInt(1024);
-        if (filterValues[randVal] == false) {
-          filterValues[randVal] = true;
-          count++;
-        }
-      }
-      return filterValues;
-    }
+    String filterColumn = "vendor_id";
 
-    public static void customIntRowFilter(OrcFilterContext batch) {
-      int newSize = 0;
-      for (int row = 0; row < batch.getSelectedSize(); ++row) {
-        if (filterValues[row]) {
-          batch.getSelected()[newSize++] = row;
-        }
-      }
-      batch.setSelectedInUse(true);
-      batch.setSelectedSize(newSize);
-    }
   }
 
   @Benchmark
   public void readOrcRowFilter(Blackhole blackhole, InputState state) throws Exception {
     RecordReader rows =
         state.reader.rows(state.readerOptions
-            .setRowFilter(new String[]{state.filter_column}, InputState::customIntRowFilter));
+            .setRowFilter(new String[]{state.filterColumn}, state::customIntRowFilter));
     while (rows.nextBatch(state.batch)) {
       blackhole.consume(state.batch);
     }
diff --git a/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/RowFilterInputState.java b/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/RowFilterInputState.java
new file mode 100644
index 0000000..dba28f8
--- /dev/null
+++ b/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/RowFilterInputState.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.bench.hive.rowfilter;
+
+import org.apache.commons.lang.reflect.FieldUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.OrcFile;
+import org.apache.orc.OrcFilterContext;
+import org.apache.orc.Reader;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.bench.core.Utilities;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+
+import java.io.IOException;
+import java.util.Random;
+
+@State(Scope.Thread)
+public abstract class RowFilterInputState {
+
+  private static final Path root = new Path(System.getProperty("user.dir"));
+
+  Configuration conf = new Configuration();
+  FileSystem fs;
+  TypeDescription schema;
+  VectorizedRowBatch batch;
+  Path path;
+  boolean[] include;
+  Reader reader;
+  Reader.Options readerOptions;
+  boolean[] filterValues = null;
+
+  @Setup
+  public void setup() throws IOException, IllegalAccessException {
+    TypeDescription.RowBatchVersion version =
+        (TypeDescription.RowBatchVersion) FieldUtils.readField(this, "version", true);
+    TypeDescription.Category benchType = (TypeDescription.Category) FieldUtils.readField(this, "benchType", true);
+    String filterPerc = (String) FieldUtils.readField(this, "filterPerc", true);
+    int filterColsNum = (int) FieldUtils.readField(this, "filterColsNum", true);
+    String dataRelativePath = (String) FieldUtils.readField(this, "dataRelativePath", true);
+    String schemaName = (String) FieldUtils.readField(this, "schemaName", true);
+    String filterColumn = (String) FieldUtils.readField(this, "filterColumn", true);
+
+    fs = FileSystem.getLocal(conf).getRaw();
+    path = new Path(root, dataRelativePath);
+    schema = Utilities.loadSchema(schemaName);
+    batch = schema.createRowBatch(version, 1024);
+    include = new boolean[schema.getMaximumId() + 1];
+    for (TypeDescription child : schema.getChildren()) {
+      if (schema.getFieldNames().get(child.getId() - 1).compareTo(filterColumn) == 0) {
+        System.out.println("Apply Filter on column: " + schema.getFieldNames().get(child.getId() - 1));
+        include[child.getId()] = true;
+      } else if (child.getCategory() == benchType) {
+        System.out.println("Skip column(s): " + schema.getFieldNames().get(child.getId() - 1));
+        include[child.getId()] = true;
+        if (--filterColsNum == 0) break;
+      }
+    }
+    if (filterColsNum != 0) {
+      System.err.println("Dataset does not contain type: " + benchType);
+      System.exit(-1);
+    }
+    generateRandomSet(Double.parseDouble(filterPerc));
+    reader = OrcFile.createReader(path,
+        OrcFile.readerOptions(conf).filesystem(fs));
+    // only read the included columns
+    readerOptions = reader.options().include(include);
+  }
+
+  public void generateRandomSet(double percentage) throws IllegalArgumentException {
+    if (percentage > 1.0) {
+      throw new IllegalArgumentException("Filter percentage must be <= 1.0 but was " + percentage);
+    }
+    filterValues = new boolean[1024];
+    int count = 0;
+    while (count < (1024 * percentage)) {
+      Random randomGenerator = new Random();
+      int randVal = randomGenerator.nextInt(1024);
+      if (!filterValues[randVal]) {
+        filterValues[randVal] = true;
+        count++;
+      }
+    }
+  }
+
+  public void customIntRowFilter(OrcFilterContext batch) {
+    int newSize = 0;
+    for (int row = 0; row < batch.getSelectedSize(); ++row) {
+      if (filterValues[row]) {
+        batch.getSelected()[newSize++] = row;
+      }
+    }
+    batch.setSelectedInUse(true);
+    batch.setSelectedSize(newSize);
+  }
+
+}
diff --git a/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/StringRowFilterBenchmark.java b/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/StringRowFilterBenchmark.java
index d2f8f48..bc12fbe 100644
--- a/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/StringRowFilterBenchmark.java
+++ b/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/StringRowFilterBenchmark.java
@@ -17,16 +17,8 @@
  */
 package org.apache.orc.bench.hive.rowfilter;
 
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.orc.OrcFile;
-import org.apache.orc.Reader;
 import org.apache.orc.RecordReader;
 import org.apache.orc.TypeDescription;
-import org.apache.orc.bench.core.Utilities;
-import org.apache.orc.OrcFilterContext;
 import org.openjdk.jmh.annotations.Benchmark;
 import org.openjdk.jmh.annotations.BenchmarkMode;
 import org.openjdk.jmh.annotations.Fork;
@@ -35,7 +27,6 @@ import org.openjdk.jmh.annotations.Mode;
 import org.openjdk.jmh.annotations.OutputTimeUnit;
 import org.openjdk.jmh.annotations.Param;
 import org.openjdk.jmh.annotations.Scope;
-import org.openjdk.jmh.annotations.Setup;
 import org.openjdk.jmh.annotations.State;
 import org.openjdk.jmh.annotations.Warmup;
 import org.openjdk.jmh.infra.Blackhole;
@@ -43,8 +34,6 @@ import org.openjdk.jmh.runner.Runner;
 import org.openjdk.jmh.runner.RunnerException;
 import org.openjdk.jmh.runner.options.OptionsBuilder;
 
-import java.io.IOException;
-import java.util.Random;
 import java.util.concurrent.TimeUnit;
 
 @State(Scope.Benchmark)
@@ -55,10 +44,8 @@ import java.util.concurrent.TimeUnit;
 @OutputTimeUnit(TimeUnit.MILLISECONDS)
 public class StringRowFilterBenchmark extends org.openjdk.jmh.Main {
 
-  private static final Path root = new Path(System.getProperty("user.dir"));
-
   @State(Scope.Thread)
-  public static class InputState {
+  public static class InputState extends RowFilterInputState {
 
     @Param({"ORIGINAL"})
     public TypeDescription.RowBatchVersion version;
@@ -72,79 +59,19 @@ public class StringRowFilterBenchmark extends org.openjdk.jmh.Main {
     @Param({"2"})
     public int filterColsNum;
 
-    Configuration conf = new Configuration();
-    FileSystem fs;
-    TypeDescription schema;
-    VectorizedRowBatch batch;
-    Path path;
-    boolean[] include;
-    Reader reader;
-    Reader.Options readerOptions;
-    String filter_column = "sales_id";
+    String dataRelativePath = "data/generated/sales/orc.none";
 
-    @Setup
-    public void setup() throws IOException {
-      fs = FileSystem.getLocal(conf).getRaw();
-      path = new Path(root, "data/generated/sales/orc.none");
-      schema = Utilities.loadSchema("sales.schema");
-      batch = schema.createRowBatch(version, 1024);
-      include = new boolean[schema.getMaximumId() + 1];
-      for(TypeDescription child: schema.getChildren()) {
-        if (schema.getFieldNames().get(child.getId()-1).compareTo(filter_column) == 0) {
-          System.out.println("Apply Filter on column: " + schema.getFieldNames().get(child.getId()-1));
-          include[child.getId()] = true;
-        } else if (child.getCategory() == benchType) {
-          System.out.println("Skip column(s): " + schema.getFieldNames().get(child.getId()-1));
-          include[child.getId()] = true;
-          if (--filterColsNum == 0) break;
-        }
-      }
-      if (filterColsNum != 0) {
-        System.err.println("Dataset does not contain type: "+ benchType);
-        System.exit(-1);
-      }
-      generateRandomSet(Double.parseDouble(filterPerc));
-      reader = OrcFile.createReader(path,
-          OrcFile.readerOptions(conf).filesystem(fs));
-      // just read the String columns
-      readerOptions = reader.options().include(include);
-    }
+    String schemaName = "sales.schema";
 
-    static boolean[] filterValues = null;
-    public static boolean[] generateRandomSet(double percentage) throws IllegalArgumentException {
-      if (percentage > 1.0) {
-        throw new IllegalArgumentException("Filter percentage must be < 1.0 but was "+ percentage);
-      }
-      filterValues = new boolean[1024];
-      int count = 0;
-      while (count < (1024 * percentage)) {
-        Random randomGenerator = new Random();
-        int randVal = randomGenerator.nextInt(1024);
-        if (filterValues[randVal] == false) {
-          filterValues[randVal] = true;
-          count++;
-        }
-      }
-      return filterValues;
-    }
+    String filterColumn = "sales_id";
 
-    public static void customIntRowFilter(OrcFilterContext batch) {
-      int newSize = 0;
-      for (int row = 0; row < batch.getSelectedSize(); ++row) {
-        if (filterValues[row]) {
-          batch.getSelected()[newSize++] = row;
-        }
-      }
-      batch.setSelectedInUse(true);
-      batch.setSelectedSize(newSize);
-    }
   }
 
   @Benchmark
   public void readOrcRowFilter(Blackhole blackhole, InputState state) throws Exception {
     RecordReader rows =
         state.reader.rows(state.readerOptions
-            .setRowFilter(new String[]{state.filter_column}, InputState::customIntRowFilter));
+            .setRowFilter(new String[]{state.filterColumn}, state::customIntRowFilter));
     while (rows.nextBatch(state.batch)) {
       blackhole.consume(state.batch);
     }
diff --git a/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/TimestampRowFilterBenchmark.java b/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/TimestampRowFilterBenchmark.java
index c4994eb..a8aa6ba 100644
--- a/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/TimestampRowFilterBenchmark.java
+++ b/java/bench/hive/src/java/org/apache/orc/bench/hive/rowfilter/TimestampRowFilterBenchmark.java
@@ -17,16 +17,9 @@
  */
 package org.apache.orc.bench.hive.rowfilter;
 
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.orc.OrcFile;
-import org.apache.orc.Reader;
 import org.apache.orc.RecordReader;
 import org.apache.orc.TypeDescription;
-import org.apache.orc.bench.core.Utilities;
-import org.apache.orc.OrcFilterContext;
 import org.openjdk.jmh.annotations.Benchmark;
 import org.openjdk.jmh.annotations.BenchmarkMode;
 import org.openjdk.jmh.annotations.Fork;
@@ -35,7 +28,6 @@ import org.openjdk.jmh.annotations.Mode;
 import org.openjdk.jmh.annotations.OutputTimeUnit;
 import org.openjdk.jmh.annotations.Param;
 import org.openjdk.jmh.annotations.Scope;
-import org.openjdk.jmh.annotations.Setup;
 import org.openjdk.jmh.annotations.State;
 import org.openjdk.jmh.annotations.Warmup;
 import org.openjdk.jmh.infra.Blackhole;
@@ -43,8 +35,6 @@ import org.openjdk.jmh.runner.Runner;
 import org.openjdk.jmh.runner.RunnerException;
 import org.openjdk.jmh.runner.options.OptionsBuilder;
 
-import java.io.IOException;
-import java.util.Random;
 import java.util.concurrent.TimeUnit;
 
 @State(Scope.Benchmark)
@@ -57,7 +47,7 @@ public class TimestampRowFilterBenchmark extends org.openjdk.jmh.Main {
   private static final Path root = new Path(System.getProperty("user.dir"));
 
   @State(Scope.Thread)
-  public static class InputState {
+  public static class InputState extends RowFilterInputState {
 
     // try both DecimalColumnVector and Decimal64
     @Param({"ORIGINAL"})
@@ -72,79 +62,19 @@ public class TimestampRowFilterBenchmark extends org.openjdk.jmh.Main {
     @Param({"2"})
     public int filterColsNum;
 
-    Configuration conf = new Configuration();
-    FileSystem fs;
-    TypeDescription schema;
-    VectorizedRowBatch batch;
-    Path path;
-    boolean[] include;
-    Reader reader;
-    Reader.Options readerOptions;
-    String filter_column = "vendor_id";
+    String dataRelativePath = "data/generated/taxi/orc.none";
 
-    @Setup
-    public void setup() throws IOException {
-      fs = FileSystem.getLocal(conf).getRaw();
-      path = new Path(root, "data/generated/taxi/orc.none");
-      schema = Utilities.loadSchema("taxi.schema");
-      batch = schema.createRowBatch(version, 1024);
-      include = new boolean[schema.getMaximumId() + 1];
-      for(TypeDescription child: schema.getChildren()) {
-        if (schema.getFieldNames().get(child.getId()-1).compareTo(filter_column) == 0) {
-          System.out.println("Apply Filter on column: " + schema.getFieldNames().get(child.getId()-1));
-          include[child.getId()] = true;
-        } else if (child.getCategory() == benchType) {
-          System.out.println("Skip column(s): " + schema.getFieldNames().get(child.getId()-1));
-          include[child.getId()] = true;
-          if (--filterColsNum == 0) break;
-        }
-      }
-      if (filterColsNum != 0) {
-        System.err.println("Dataset does not contain type: "+ benchType);
-        System.exit(-1);
-      }
-      generateRandomSet(Double.parseDouble(filterPerc));
-      reader = OrcFile.createReader(path,
-          OrcFile.readerOptions(conf).filesystem(fs));
-      // just read the Timestamp columns
-      readerOptions = reader.options().include(include);
-    }
+    String schemaName = "taxi.schema";
 
-    static boolean[] filterValues = null;
-    public static boolean[] generateRandomSet(double percentage) throws IllegalArgumentException {
-      if (percentage > 1.0) {
-        throw new IllegalArgumentException("Filter percentage must be < 1.0 but was "+ percentage);
-      }
-     filterValues = new boolean[1024];
-      int count = 0;
-      while (count < (1024 * percentage)) {
-        Random randomGenerator = new Random();
-        int randVal = randomGenerator.nextInt(1024);
-        if (filterValues[randVal] == false) {
-          filterValues[randVal] = true;
-          count++;
-        }
-      }
-      return filterValues;
-    }
+    String filterColumn = "vendor_id";
 
-    public static void customIntRowFilter(OrcFilterContext batch) {
-      int newSize = 0;
-      for (int row = 0; row < batch.getSelectedSize(); ++row) {
-        if (filterValues[row]) {
-          batch.getSelected()[newSize++] = row;
-        }
-      }
-      batch.setSelectedInUse(true);
-      batch.setSelectedSize(newSize);
-    }
   }
 
   @Benchmark
   public void readOrcRowFilter(Blackhole blackhole, InputState state) throws Exception {
     RecordReader rows =
         state.reader.rows(state.readerOptions
-            .setRowFilter(new String[]{state.filter_column}, InputState::customIntRowFilter));
+            .setRowFilter(new String[]{state.filterColumn}, state::customIntRowFilter));
     while (rows.nextBatch(state.batch)) {
       blackhole.consume(state.batch);
     }