Posted to commits@hive.apache.org by br...@apache.org on 2015/01/22 02:26:33 UTC

svn commit: r1653746 - in /hive/trunk/itests: ./ hive-jmh/ hive-jmh/src/ hive-jmh/src/main/ hive-jmh/src/main/java/ hive-jmh/src/main/java/org/ hive-jmh/src/main/java/org/apache/ hive-jmh/src/main/java/org/apache/hive/ hive-jmh/src/main/java/org/apache...

Author: brock
Date: Thu Jan 22 01:26:32 2015
New Revision: 1653746

URL: http://svn.apache.org/r1653746
Log:
HIVE-8121 - Create micro-benchmarks for ParquetSerde and evaluate performance (Sergio via Brock)

Added:
    hive/trunk/itests/hive-jmh/
    hive/trunk/itests/hive-jmh/pom.xml
    hive/trunk/itests/hive-jmh/src/
    hive/trunk/itests/hive-jmh/src/main/
    hive/trunk/itests/hive-jmh/src/main/java/
    hive/trunk/itests/hive-jmh/src/main/java/org/
    hive/trunk/itests/hive-jmh/src/main/java/org/apache/
    hive/trunk/itests/hive-jmh/src/main/java/org/apache/hive/
    hive/trunk/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/
    hive/trunk/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/storage/
    hive/trunk/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/storage/ColumnarStorageBench.java
Modified:
    hive/trunk/itests/pom.xml

Added: hive/trunk/itests/hive-jmh/pom.xml
URL: http://svn.apache.org/viewvc/hive/trunk/itests/hive-jmh/pom.xml?rev=1653746&view=auto
==============================================================================
--- hive/trunk/itests/hive-jmh/pom.xml (added)
+++ hive/trunk/itests/hive-jmh/pom.xml Thu Jan 22 01:26:32 2015
@@ -0,0 +1,121 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.hive</groupId>
+    <artifactId>hive-it</artifactId>
+    <version>0.15.0-SNAPSHOT</version>
+    <relativePath>../pom.xml</relativePath>
+  </parent>
+
+  <groupId>org.apache.hive</groupId>
+  <artifactId>hive-jmh</artifactId>
+  <packaging>jar</packaging>
+
+  <name>JMH benchmark: Hive</name>
+
+  <properties>
+    <hive.path.to.root>../..</hive.path.to.root>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    <jmh.version>1.4.1</jmh.version>
+    <uberjar.name>benchmarks</uberjar.name>
+  </properties>
+
+  <dependencies>
+    <dependency>
+      <groupId>org.openjdk.jmh</groupId>
+      <artifactId>jmh-core</artifactId>
+      <version>${jmh.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.openjdk.jmh</groupId>
+      <artifactId>jmh-generator-annprocess</artifactId>
+      <version>${jmh.version}</version>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hive</groupId>
+      <artifactId>hive-serde</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hive</groupId>
+      <artifactId>hive-exec</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+  </dependencies>
+
+  <profiles>
+    <profile>
+      <id>hadoop-2</id>
+      <dependencies>
+        <dependency>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-common</artifactId>
+          <version>${hadoop-23.version}</version>
+        </dependency>
+        <dependency>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-mapreduce-client-core</artifactId>
+          <version>${hadoop-23.version}</version>
+        </dependency>
+      </dependencies>
+    </profile>
+  </profiles>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-shade-plugin</artifactId>
+        <version>2.2</version>
+        <executions>
+          <execution>
+            <phase>package</phase>
+            <goals>
+              <goal>shade</goal>
+            </goals>
+            <configuration>
+              <finalName>${uberjar.name}</finalName>
+              <transformers>
+                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+                  <mainClass>org.openjdk.jmh.Main</mainClass>
+                </transformer>
+              </transformers>
+              <filters>
+                <filter>
+                  <!--
+                  Shading signed JARs will fail without this.
+                  http://stackoverflow.com/questions/999489/invalid-signature-file-when-attempting-to-run-a-jar
+                  -->
+                  <artifact>*:*</artifact>
+                  <excludes>
+                    <exclude>META-INF/*.SF</exclude>
+                    <exclude>META-INF/*.DSA</exclude>
+                    <exclude>META-INF/*.RSA</exclude>
+                  </excludes>
+                </filter>
+              </filters>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+
+</project>

Added: hive/trunk/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/storage/ColumnarStorageBench.java
URL: http://svn.apache.org/viewvc/hive/trunk/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/storage/ColumnarStorageBench.java?rev=1653746&view=auto
==============================================================================
--- hive/trunk/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/storage/ColumnarStorageBench.java (added)
+++ hive/trunk/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/storage/ColumnarStorageBench.java Thu Jan 22 01:26:32 2015
@@ -0,0 +1,436 @@
+/**
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.benchmark.storage;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
+import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
+import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
+import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat;
+import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
+import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat;
+import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat;
+import org.apache.hadoop.hive.ql.io.parquet.serde.ArrayWritableObjectInspector;
+import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
+import org.apache.hadoop.hive.serde2.SerDe;
+import org.apache.hadoop.hive.serde2.SerDeException;
+import org.apache.hadoop.hive.serde2.SerDeUtils;
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.ArrayWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.BooleanWritable;
+import org.apache.hadoop.io.BytesWritable;
+
+import org.apache.hadoop.mapred.FileSplit;
+import org.apache.hadoop.mapred.InputFormat;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RecordReader;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.options.Options;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
+import java.util.Properties;
+import java.util.List;
+import java.util.Random;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+@State(Scope.Benchmark)
+public class ColumnarStorageBench {
+  /**
+   * This benchmark measures the performance of the different columnar storage formats
+   * used by Hive. If you need to add more formats, see the 'format' global variable to
+   * add a new one to the list, and create a class that extends the StorageFormatTest
+   * class.
+   *
+   * This test uses the JMH framework for benchmarking.
+   * You may execute this benchmark tool using the JMH command line in different ways:
+   *
+   * To use the settings shown in the main() function, use:
+   * $ java -cp target/benchmarks.jar org.apache.hive.benchmark.storage.ColumnarStorageBench
+   *
+   * To use the default settings used by JMH, use:
+   * $ java -jar target/benchmarks.jar org.apache.hive.benchmark.storage.ColumnarStorageBench
+   *
+   * To specify different parameters, pass JMH options on the command line. For example,
+   * the following command uses 10 warm-up iterations, 5 test iterations, and 2 forks,
+   * and displays the Average Time (avgt) in microseconds (us):
+   *
+   * $ java -jar target/benchmarks.jar org.apache.hive.benchmark.storage.ColumnarStorageBench -wi 10 -i 5 -f 2 -bm avgt -tu us
+   *
+   * - Benchmark mode (-bm). Available modes are:
+   *   [Throughput/thrpt, AverageTime/avgt, SampleTime/sample, SingleShotTime/ss, All/all]
+   * - Output time unit (-tu). Available time units are: [m, s, ms, us, ns].
+   */
+
+  private static final String DEFAULT_TEMP_LOCATION = "/tmp";
+
+  private File writeFile, readFile, recordWriterFile;
+  private Path writePath, readPath, recordWriterPath;
+  private FileSystem fs;
+
+  /**
+   * Contains the implementation for the storage format under test.
+   */
+  private StorageFormatTest storageFormatTest;
+
+  private RecordWriter recordWriter;
+  private RecordReader recordReader;
+
+  /**
+   * These objects contain the records to be tested.
+   */
+  private Writable[] recordWritable;
+  private Object[] rows;
+  private StructObjectInspector oi;
+
+  /**
+   * These column types are used for the record that will be tested.
+   */
+  private Properties recordProperties;
+  private static final String DEFAULT_COLUMN_TYPES = "int,double,boolean,string,array<int>,map<string,string>,struct<a:int,b:int>";
+
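+  /* Rows are generated once in the constructor so that random-data creation does
+   * not pollute the timings measured by the @Benchmark methods below. */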
+  public ColumnarStorageBench() {
+    recordProperties = new Properties();
+    recordProperties.setProperty("columns", getColumnNames(DEFAULT_COLUMN_TYPES));
+    recordProperties.setProperty("columns.types", DEFAULT_COLUMN_TYPES);
+
+    oi = getObjectInspector(DEFAULT_COLUMN_TYPES);
+
+    final int NUMBER_OF_ROWS_TO_TEST = 100;
+    rows = new Object[NUMBER_OF_ROWS_TO_TEST];
+    recordWritable = new Writable[NUMBER_OF_ROWS_TO_TEST];
+
+    for (int i=0; i<NUMBER_OF_ROWS_TO_TEST; i++) {
+      rows[i] = createRandomRow(DEFAULT_COLUMN_TYPES);
+    }
+  }
+
+  private String getColumnNames(final String columnTypes) {
+    StringBuilder columnNames = new StringBuilder();
+
+    /* Construct a string of column names based on the number of column types */
+    List<TypeInfo> columnTypesList = TypeInfoUtils.getTypeInfosFromTypeString(columnTypes);
+    for (int i=0; i < columnTypesList.size(); i++) {
+      if (i > 0) {
+        columnNames.append(",");
+      }
+      columnNames.append("c" + i);
+    }
+
+    return columnNames.toString();
+  }
+
+  private long fileLength(Path path) throws IOException {
+    return fs.getFileStatus(path).getLen();
+  }
+
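+  /** Wraps a set of field values into a single ArrayWritable record. */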
+  private ArrayWritable record(Writable... fields) {
+    return new ArrayWritable(Writable.class, fields);
+  }
+
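+  /** Returns a Writable holding random data for the given primitive type.
+   *  CHAR, VARCHAR, and STRING are all represented here as 30 random bytes. */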
+  private Writable getPrimitiveWritable(final PrimitiveTypeInfo typeInfo) {
+    Random rand = new Random();
+
+    switch (typeInfo.getPrimitiveCategory()) {
+      case INT:
+        return new IntWritable(rand.nextInt());
+      case DOUBLE:
+        return new DoubleWritable(rand.nextDouble());
+      case BOOLEAN:
+        return new BooleanWritable(rand.nextBoolean());
+      case CHAR:
+      case VARCHAR:
+      case STRING:
+        byte[] b = new byte[30];
+        rand.nextBytes(b);
+        return new BytesWritable(b);
+      default:
+        throw new IllegalArgumentException("Invalid primitive type: " + typeInfo.getTypeName());
+    }
+  }
+
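+  /* Recursively builds one row as nested ArrayWritables; lists, maps, and structs
+   * are wrapped in extra ArrayWritable levels so the layout matches the struct
+   * described by getObjectInspector(). */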
+  private ArrayWritable createRecord(final List<TypeInfo> columnTypes) {
+    Writable[] fields = new Writable[columnTypes.size()];
+
+    int pos=0;
+    for (TypeInfo type : columnTypes) {
+      switch (type.getCategory()) {
+        case PRIMITIVE:
+          fields[pos++] = getPrimitiveWritable((PrimitiveTypeInfo)type);
+        break;
+        case LIST: {
+          List<TypeInfo> elementType = new ArrayList<TypeInfo>();
+          elementType.add(((ListTypeInfo) type).getListElementTypeInfo());
+          fields[pos++] = record(createRecord(elementType));
+        } break;
+        case MAP: {
+          List<TypeInfo> keyValueType = new ArrayList<TypeInfo>();
+          keyValueType.add(((MapTypeInfo) type).getMapKeyTypeInfo());
+          keyValueType.add(((MapTypeInfo) type).getMapValueTypeInfo());
+          fields[pos++] = record(record(createRecord(keyValueType)));
+        } break;
+        case STRUCT: {
+          List<TypeInfo> elementType = ((StructTypeInfo) type).getAllStructFieldTypeInfos();
+          fields[pos++] = createRecord(elementType);
+        } break;
+        default:
+          throw new IllegalStateException("Invalid column type: " + type);
+      }
+    }
+
+    return record(fields);
+  }
+
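+  /** Builds an ArrayWritableObjectInspector describing a struct with columns c0..cN of the given types. */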
+  private StructObjectInspector getObjectInspector(final String columnTypes) {
+    List<TypeInfo> columnTypeList = TypeInfoUtils.getTypeInfosFromTypeString(columnTypes);
+    List<String> columnNameList = Arrays.asList(getColumnNames(columnTypes).split(","));
+    StructTypeInfo rowTypeInfo = (StructTypeInfo)TypeInfoFactory.getStructTypeInfo(columnNameList, columnTypeList);
+
+    return new ArrayWritableObjectInspector(rowTypeInfo);
+  }
+
+  private Object createRandomRow(final String columnTypes) {
+    return createRecord(TypeInfoUtils.getTypeInfosFromTypeString(columnTypes));
+  }
+
+  /**
+   * This class encapsulates all methods that will be called by each of the @Benchmark
+   * methods.
+   */
+  private class StorageFormatTest {
+    private SerDe serDe;
+    private JobConf jobConf;
+    private HiveOutputFormat outputFormat;
+    private InputFormat inputFormat;
+
+    public StorageFormatTest(SerDe serDeImpl, HiveOutputFormat outputFormatImpl, InputFormat inputFormatImpl) throws SerDeException {
+      jobConf = new JobConf();
+      serDe = serDeImpl;
+      outputFormat = outputFormatImpl;
+      inputFormat = inputFormatImpl;
+
+      Configuration conf = new Configuration();
+      SerDeUtils.initializeSerDe(serDe, conf, recordProperties, null);
+    }
+
+    public Writable serialize(Object row, StructObjectInspector oi) throws SerDeException {
+      return serDe.serialize(row, oi);
+    }
+
+    public Object deserialize(Writable record) throws SerDeException {
+      return serDe.deserialize(record);
+    }
+
+    /* We write many records because sometimes the RecordWriter for the format under test
+     * behaves differently with one record than with a bunch of records. */
+    public void writeRecords(RecordWriter writer, Writable[] records) throws IOException {
+      for (int i=0; i < records.length; i++) {
+        writer.write(records[i]);
+      }
+    }
+
+    /* We read many records because sometimes the RecordReader for the format under test
+     * behaves differently with one record than with a bunch of records. */
+    public Object readRecords(RecordReader reader) throws IOException {
+      Object alwaysNull = reader.createKey();
+      Object record = reader.createValue();
+
+      // Just loop through all values; we do not need to store anything.
+      // This is just for benchmark purposes.
+      while (reader.next(alwaysNull, record)) {
+        // Keep reading until the end of the file.
+      }
+
+      return record;
+    }
+
+    public RecordWriter getRecordWriter(Path outputPath) throws IOException {
+      return outputFormat.getHiveRecordWriter(jobConf, outputPath, null, false, recordProperties, null);
+    }
+
+    public RecordReader getRecordReader(Path inputPath) throws IOException {
+      return inputFormat.getRecordReader(
+          new FileSplit(inputPath, 0, fileLength(inputPath), (String[]) null),
+          jobConf, null);
+    }
+  }
+
+  /**
+   * This class is used to run Parquet I/O tests.
+   */
+  private class ParquetStorageFormatTest extends StorageFormatTest {
+    public ParquetStorageFormatTest() throws SerDeException {
+      super(new ParquetHiveSerDe(), new MapredParquetOutputFormat(), new MapredParquetInputFormat());
+    }
+  }
+
+  /**
+   * This class is used to run ORC I/O tests.
+   */
+  private class OrcStorageFormatTest extends StorageFormatTest {
+    public OrcStorageFormatTest() throws SerDeException {
+      super(new OrcSerde(), new OrcOutputFormat(), new OrcInputFormat());
+    }
+  }
+
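+  /*
+   * Adding another format only requires one more subclass of StorageFormatTest,
+   * plus an entry in the 'format' @Param list and in prepareBenchmark(). A
+   * hypothetical sketch using Hive's Avro classes (illustrative only, not part
+   * of this patch):
+   *
+   *   private class AvroStorageFormatTest extends StorageFormatTest {
+   *     public AvroStorageFormatTest() throws SerDeException {
+   *       super(new AvroSerDe(), new AvroContainerOutputFormat(), new AvroContainerInputFormat());
+   *     }
+   *   }
+   */
+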
+  private File createTempFile() throws IOException {
+    if (URI.create(DEFAULT_TEMP_LOCATION).getScheme() != null) {
+      throw new IOException("Cannot create temporary files in a non-local file-system: Operation not permitted.");
+    }
+
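+    // Reserve a unique file name, then delete the file immediately so the
+    // record writer can create it from scratch; deleteOnExit() covers cleanup
+    // if the JVM dies mid-benchmark.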
+    File temp = File.createTempFile(this.toString(), null, new File(DEFAULT_TEMP_LOCATION));
+    temp.deleteOnExit();
+    temp.delete();
+
+    return temp;
+  }
+
+  // Test different format types
+  @Param({"orc", "parquet"})
+  public String format;
+
+  /**
+   * Initializes resources that will be needed for each of the benchmark tests.
+   *
+   * @throws SerDeException If it cannot initialize the desired test format.
+   * @throws IOException If it cannot write data to temporary files.
+   */
+  @Setup(Level.Trial)
+  public void prepareBenchmark() throws SerDeException, IOException {
+    if (format.equalsIgnoreCase("parquet")) {
+      storageFormatTest = new ParquetStorageFormatTest();
+    } else if (format.equalsIgnoreCase("orc")) {
+      storageFormatTest = new OrcStorageFormatTest();
+    } else {
+      throw new IllegalArgumentException("Invalid file format argument: " + format);
+    }
+
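+    /* Serialize all rows once per trial so that the write() benchmark measures
+     * only the record writer, not the SerDe. */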
+    for (int i=0; i < rows.length; i++) {
+      recordWritable[i] = storageFormatTest.serialize(rows[i], oi);
+    }
+
+    fs = FileSystem.getLocal(new Configuration());
+
+    writeFile = createTempFile();
+    writePath = new Path(writeFile.getPath());
+
+    readFile = createTempFile();
+    readPath = new Path(readFile.getPath());
+
+    /*
+     * Write a bunch of random rows that will be used for read benchmark.
+     */
+    RecordWriter writer = storageFormatTest.getRecordWriter(readPath);
+    storageFormatTest.writeRecords(writer, recordWritable);
+    writer.close(false);
+  }
+
+  /**
+   * It deletes any temporary files created by prepareBenchmark().
+   */
+  @TearDown(Level.Trial)
+  public void cleanUpBenchmark() {
+    readFile.delete();
+  }
+
+  /**
+   * This method is invoked before every call to the methods under test. It creates
+   * resources that are needed for each invocation (not at the benchmark trial level).
+   *
+   * @throws IOException If it cannot write temporary files.
+   */
+  @Setup(Level.Invocation)
+  public void prepareInvocation() throws IOException {
+    recordWriterFile = createTempFile();
+    recordWriterPath = new Path(recordWriterFile.getPath());
+
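+    // A fresh writer and reader are opened per invocation so that open/close
+    // costs stay out of the measured write() and read() bodies.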
+    recordWriter = storageFormatTest.getRecordWriter(writePath);
+    recordReader = storageFormatTest.getRecordReader(readPath);
+  }
+
+  /**
+   * This method is invoked after every call to the methods under test. It closes
+   * the record writer and reader, and deletes all temporary files.
+   *
+   * @throws IOException If it cannot close or delete temporary files.
+   */
+  @TearDown(Level.Invocation)
+  public void cleanUpInvocation() throws IOException {
+    recordWriter.close(false);
+    recordReader.close();
+
+    recordWriterFile.delete();
+    writeFile.delete();
+  }
+
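+  /* The @Benchmark methods below measure, per format: bulk record writing and
+   * reading, single-record SerDe serialization and deserialization, and the
+   * cost of creating a record writer or reader. */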
+  @Benchmark
+  public void write() throws IOException {
+    storageFormatTest.writeRecords(recordWriter, recordWritable);
+  }
+
+  @Benchmark
+  public Object read() throws IOException {
+    return storageFormatTest.readRecords(recordReader);
+  }
+
+  @Benchmark
+  public Writable serialize() throws SerDeException {
+    return storageFormatTest.serialize(rows[0], oi);
+  }
+
+  @Benchmark
+  public Object deserialize() throws SerDeException {
+    return storageFormatTest.deserialize(recordWritable[0]);
+  }
+
+  @Benchmark
+  public RecordWriter getRecordWriter() throws IOException {
+    return storageFormatTest.getRecordWriter(recordWriterPath);
+  }
+
+  @Benchmark
+  public RecordReader getRecordReader() throws IOException {
+    return storageFormatTest.getRecordReader(readPath);
+  }
+
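+  /** Quick local run: a single fork with one warm-up and one measurement iteration per format. */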
+  public static void main(String[] args) throws Exception {
+    Options opt = new OptionsBuilder()
+        .include(ColumnarStorageBench.class.getSimpleName())
+        .warmupIterations(1)
+        .measurementIterations(1)
+        .forks(1)
+        .build();
+
+    new Runner(opt).run();
+  }
+}

Modified: hive/trunk/itests/pom.xml
URL: http://svn.apache.org/viewvc/hive/trunk/itests/pom.xml?rev=1653746&r1=1653745&r2=1653746&view=diff
==============================================================================
--- hive/trunk/itests/pom.xml (original)
+++ hive/trunk/itests/pom.xml Thu Jan 22 01:26:32 2015
@@ -38,6 +38,7 @@
    <module>util</module>
    <module>test-serde</module>
    <module>qtest</module>
+   <module>hive-jmh</module>
   </modules>
 
   <profiles>