You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@orc.apache.org by prasanthj <gi...@git.apache.org> on 2018/05/03 00:37:28 UTC
[GitHub] orc pull request #252: ORC-345 Create and use a new Decimal64StatisticsImpl ...
Github user prasanthj commented on a diff in the pull request:
https://github.com/apache/orc/pull/252#discussion_r185673722
--- Diff: java/bench/src/java/org/apache/orc/bench/DecimalBench.java ---
@@ -0,0 +1,270 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench;
+
+import com.google.gson.JsonStreamParser;
+import org.apache.avro.file.DataFileReader;
+import org.apache.avro.generic.GenericDatumReader;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.avro.io.DatumReader;
+import org.apache.avro.mapred.FsInput;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.TrackingLocalFileSystem;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.Decimal64ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport;
+import org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper;
+import org.apache.hadoop.io.ArrayWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.mapred.FileSplit;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.orc.CompressionKind;
+import org.apache.orc.OrcFile;
+import org.apache.orc.Reader;
+import org.apache.orc.RecordReader;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.Writer;
+import org.apache.orc.bench.convert.BatchReader;
+import org.apache.orc.bench.convert.GenerateVariants;
+import org.apache.orc.bench.convert.csv.CsvReader;
+import org.apache.parquet.hadoop.ParquetInputFormat;
+import org.openjdk.jmh.annotations.AuxCounters;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.URI;
+import java.nio.charset.StandardCharsets;
+import java.util.concurrent.TimeUnit;
+
+@BenchmarkMode(Mode.AverageTime)
+@Warmup(iterations=2, time=30, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations=10, time=30, timeUnit = TimeUnit.SECONDS)
+@State(Scope.Thread)
+@OutputTimeUnit(TimeUnit.MICROSECONDS)
+@Fork(2)
+public class DecimalBench {
+
+ private static final String ROOT_ENVIRONMENT_NAME = "bench.root.dir";
+ private static final Path root;
+ static {
+ String value = System.getProperty(ROOT_ENVIRONMENT_NAME);
+ root = value == null ? null : new Path(value);
+ }
+
+ /**
+ * Abstract out whether we are writing short or long decimals
+ */
+ interface Loader {
+ /**
+ * Load the data from the values array into the ColumnVector.
+ * @param vector the output
+ * @param values the intput
+ * @param offset the first input value
+ * @param length the number of values to copy
+ */
+ void loadData(ColumnVector vector, long[] values, int offset, int length);
+ }
+
+ static class Decimal64Loader implements Loader {
+ final int scale;
+ final int precision;
+
+ Decimal64Loader(int precision, int scale) {
+ this.precision = precision;
+ this.scale = scale;
+ }
+
+ @Override
+ public void loadData(ColumnVector vector, long[] values, int offset, int length) {
+ Decimal64ColumnVector v = (Decimal64ColumnVector) vector;
+ v.ensureSize(length, false);
+ v.noNulls = true;
+ for(int p=0; p < length; ++p) {
+ v.vector[p] = values[p + offset];
+ }
+ v.precision = (short) precision;
+ v.scale = (short) scale;
+ }
+ }
+
+ static class DecimalLoader implements Loader {
+ final int scale;
+ final int precision;
+
+ DecimalLoader(int precision, int scale) {
+ this.precision = precision;
+ this.scale = scale;
+ }
+
+ @Override
+ public void loadData(ColumnVector vector, long[] values, int offset, int length) {
+ DecimalColumnVector v = (DecimalColumnVector) vector;
+ v.noNulls = true;
+ for(int p=0; p < length; ++p) {
+ v.vector[p].setFromLongAndScale(values[offset + p], scale);
+ }
+ v.precision = (short) precision;
+ v.scale = (short) scale;
+ }
+ }
+
+ @State(Scope.Thread)
+ public static class OutputState {
+
+ // try both short and long decimals
+ @Param({"8", "19"})
+ public int precision;
+
+ long[] total_amount = new long[1024 * 1024];
+ Configuration conf = new Configuration();
+ FileSystem fs = new NullFileSystem();
+ TypeDescription schema;
+ VectorizedRowBatch batch;
+ Loader loader;
+
+ @Setup
+ public void setup() throws IOException {
+ schema = TypeDescription.createDecimal()
+ .withScale(2)
+ .withPrecision(precision);
+ loader = precision <= 18 ?
+ new Decimal64Loader(precision, 2) :
+ new DecimalLoader(precision, 2);
+ readCsvData(total_amount, root, "total_amount", conf);
+ batch = schema.createRowBatchV2();
+ }
+ }
+
+ @Benchmark
+ public void write(OutputState state) throws Exception {
+ Writer writer = OrcFile.createWriter(new Path("null"),
--- End diff --
Should the writer creation moved out of the @Benchmark method? I think we might unnecessarily measure IO creating writer.
---