Posted to commits@hive.apache.org by om...@apache.org on 2016/05/20 21:22:51 UTC
[13/27] hive git commit: HIVE-11417. Move the ReaderImpl and RowReaderImpl to the ORC module, by making shims for the row-by-row reader. (omalley reviewed by prasanth_j)
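
The shim pattern named in the commit message is visible in the ReaderImpl hunk near the end of this patch: the Hive-side class now extends the relocated ORC-module implementation and keeps only the Hive-specific, row-by-row surface (the ObjectInspector). A minimal, self-contained sketch of that structure; the class and member names below are illustrative stand-ins, not the real Hive/ORC types:

    // CoreReader stands in for org.apache.orc.impl.ReaderImpl: the
    // format-level plumbing that moved into the ORC module.
    class CoreReader {
      long getNumberOfRows() { return 0; }
    }

    // HiveReader stands in for org.apache.hadoop.hive.ql.io.orc.Reader:
    // the row-by-row API that stays on the Hive side.
    interface HiveReader {
      Object getObjectInspector();
    }

    // The shim inherits everything format-level and adds back only the
    // Hive-specific state, mirroring the ReaderImpl diff below.
    class HiveReaderShim extends CoreReader implements HiveReader {
      private final Object inspector;
      HiveReaderShim(Object inspector) { this.inspector = inspector; }
      @Override public Object getObjectInspector() { return inspector; }
    }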
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java
deleted file mode 100644
index 9c2f88f..0000000
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java
+++ /dev/null
@@ -1,884 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.hive.ql.io.orc;
-
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.PrintStream;
-import java.text.DecimalFormat;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.commons.cli.CommandLine;
-import org.apache.commons.cli.GnuParser;
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.OptionBuilder;
-import org.apache.commons.cli.Options;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FSDataOutputStream;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PathFilter;
-import org.apache.hadoop.hdfs.DistributedFileSystem;
-import org.apache.hadoop.hive.ql.io.AcidUtils;
-import org.apache.orc.BloomFilterIO;
-import org.apache.hadoop.hive.serde2.io.ByteWritable;
-import org.apache.hadoop.hive.serde2.io.DoubleWritable;
-import org.apache.hadoop.hive.serde2.io.ShortWritable;
-import org.apache.hadoop.io.BooleanWritable;
-import org.apache.hadoop.io.FloatWritable;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.orc.ColumnStatistics;
-import org.apache.orc.TypeDescription;
-import org.apache.orc.impl.ColumnStatisticsImpl;
-import org.apache.orc.impl.OrcIndex;
-import org.apache.orc.OrcProto;
-import org.apache.orc.StripeInformation;
-import org.apache.orc.StripeStatistics;
-import org.codehaus.jettison.json.JSONException;
-import org.codehaus.jettison.json.JSONWriter;
-
-import com.google.common.base.Joiner;
-import com.google.common.base.Strings;
-import com.google.common.collect.Lists;
-
-/**
- * A tool for printing out the file structure of ORC files.
- */
-public final class FileDump {
- public static final String UNKNOWN = "UNKNOWN";
- public static final String SEPARATOR = Strings.repeat("_", 120) + "\n";
- public static final int DEFAULT_BLOCK_SIZE = 256 * 1024 * 1024;
- public static final String DEFAULT_BACKUP_PATH = System.getProperty("java.io.tmpdir");
- public static final PathFilter HIDDEN_AND_SIDE_FILE_FILTER = new PathFilter() {
- public boolean accept(Path p) {
- String name = p.getName();
- return !name.startsWith("_") && !name.startsWith(".") && !name.endsWith(
- AcidUtils.DELTA_SIDE_FILE_SUFFIX);
- }
- };
-
- // not used; private constructor prevents instantiation
- private FileDump() {
- }
-
- public static void main(String[] args) throws Exception {
- Configuration conf = new Configuration();
-
- List<Integer> rowIndexCols = null;
- Options opts = createOptions();
- CommandLine cli = new GnuParser().parse(opts, args);
-
- if (cli.hasOption('h')) {
- HelpFormatter formatter = new HelpFormatter();
- formatter.printHelp("orcfiledump", opts);
- return;
- }
-
- boolean dumpData = cli.hasOption('d');
- boolean recover = cli.hasOption("recover");
- boolean skipDump = cli.hasOption("skip-dump");
- String backupPath = DEFAULT_BACKUP_PATH;
- if (cli.hasOption("backup-path")) {
- backupPath = cli.getOptionValue("backup-path");
- }
-
- if (cli.hasOption("r")) {
- String[] colStrs = cli.getOptionValue("r").split(",");
- rowIndexCols = new ArrayList<Integer>(colStrs.length);
- for (String colStr : colStrs) {
- rowIndexCols.add(Integer.parseInt(colStr));
- }
- }
-
- boolean printTimeZone = cli.hasOption('t');
- boolean jsonFormat = cli.hasOption('j');
- String[] files = cli.getArgs();
- if (files.length == 0) {
- System.err.println("Error : ORC files are not specified");
- return;
- }
-
- // if the specified path is a directory, iterate through all files and print the file dump
- List<String> filesInPath = Lists.newArrayList();
- for (String filename : files) {
- Path path = new Path(filename);
- filesInPath.addAll(getAllFilesInPath(path, conf));
- }
-
- if (dumpData) {
- printData(filesInPath, conf);
- } else if (recover && skipDump) {
- recoverFiles(filesInPath, conf, backupPath);
- } else {
- if (jsonFormat) {
- boolean prettyPrint = cli.hasOption('p');
- JsonFileDump.printJsonMetaData(filesInPath, conf, rowIndexCols, prettyPrint, printTimeZone);
- } else {
- printMetaData(filesInPath, conf, rowIndexCols, printTimeZone, recover, backupPath);
- }
- }
- }
-
- /**
- * This method returns an ORC reader object if the specified file is readable. If the specified
- * file has a side file (_flush_length), the max footer offset will be read from the side file
- * and the ORC reader will be created with that offset. Since both the data file and the side
- * file use hflush() to flush data, the two files can be out of sync. null will be returned in
- * the following cases:
- *
- * 1) The file specified by path, or its side file, is still open for writes
- * 2) The *_flush_length file does not contain any footer offset
- * 3) The *_flush_length file contains a valid footer offset, but the data file is not readable
- * at that position (incomplete data file)
- * 4) The *_flush_length file length is not a multiple of 8; the reader will then be created
- * from the previous valid footer. If there is no such footer (file length > 0 and < 8), null
- * will be returned
- *
- * Also, if this method detects any file corruption (a mismatch between the data file and the
- * side file) it will add the corresponding file to the specified list of corrupted files.
- *
- * In all other cases, where the file is readable, this method returns a reader object.
- *
- * @param path - file to get reader for
- * @param conf - configuration object
- * @param corruptFiles - fills this list with all possible corrupted files
- * @return - reader for the specified file or null
- * @throws IOException
- */
- static Reader getReader(final Path path, final Configuration conf,
- final List<String> corruptFiles) throws IOException {
- FileSystem fs = path.getFileSystem(conf);
- long dataFileLen = fs.getFileStatus(path).getLen();
- System.err.println("Processing data file " + path + " [length: " + dataFileLen + "]");
- Path sideFile = OrcRecordUpdater.getSideFile(path);
- final boolean sideFileExists = fs.exists(sideFile);
- boolean openDataFile = false;
- boolean openSideFile = false;
- if (fs instanceof DistributedFileSystem) {
- DistributedFileSystem dfs = (DistributedFileSystem) fs;
- openDataFile = !dfs.isFileClosed(path);
- openSideFile = sideFileExists && !dfs.isFileClosed(sideFile);
- }
-
- if (openDataFile || openSideFile) {
- if (openDataFile && openSideFile) {
- System.err.println("Unable to perform file dump as " + path + " and " + sideFile +
- " are still open for writes.");
- } else if (openSideFile) {
- System.err.println("Unable to perform file dump as " + sideFile +
- " is still open for writes.");
- } else {
- System.err.println("Unable to perform file dump as " + path +
- " is still open for writes.");
- }
-
- return null;
- }
-
- Reader reader = null;
- if (sideFileExists) {
- final long maxLen = OrcRawRecordMerger.getLastFlushLength(fs, path);
- final long sideFileLen = fs.getFileStatus(sideFile).getLen();
- System.err.println("Found flush length file " + sideFile
- + " [length: " + sideFileLen + ", maxFooterOffset: " + maxLen + "]");
- // no offsets read from side file
- if (maxLen == -1) {
-
- // if data file is larger than last flush length, then additional data could be recovered
- if (dataFileLen > maxLen) {
- System.err.println("Data file has more data than max footer offset:" + maxLen +
- ". Adding data file to recovery list.");
- if (corruptFiles != null) {
- corruptFiles.add(path.toUri().toString());
- }
- }
- return null;
- }
-
- try {
- reader = OrcFile.createReader(path, OrcFile.readerOptions(conf).maxLength(maxLen));
-
- // if data file is larger than last flush length, then additional data could be recovered
- if (dataFileLen > maxLen) {
- System.err.println("Data file has more data than max footer offset:" + maxLen +
- ". Adding data file to recovery list.");
- if (corruptFiles != null) {
- corruptFiles.add(path.toUri().toString());
- }
- }
- } catch (Exception e) {
- if (corruptFiles != null) {
- corruptFiles.add(path.toUri().toString());
- }
- System.err.println("Unable to read data from max footer offset." +
- " Adding data file to recovery list.");
- return null;
- }
- } else {
- reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
- }
-
- return reader;
- }
-
- public static Collection<String> getAllFilesInPath(final Path path,
- final Configuration conf) throws IOException {
- List<String> filesInPath = Lists.newArrayList();
- FileSystem fs = path.getFileSystem(conf);
- FileStatus fileStatus = fs.getFileStatus(path);
- if (fileStatus.isDir()) {
- FileStatus[] fileStatuses = fs.listStatus(path, HIDDEN_AND_SIDE_FILE_FILTER);
- for (FileStatus fileInPath : fileStatuses) {
- if (fileInPath.isDir()) {
- filesInPath.addAll(getAllFilesInPath(fileInPath.getPath(), conf));
- } else {
- filesInPath.add(fileInPath.getPath().toString());
- }
- }
- } else {
- filesInPath.add(path.toString());
- }
-
- return filesInPath;
- }
-
- private static void printData(List<String> files,
- Configuration conf) throws IOException,
- JSONException {
- for (String file : files) {
- try {
- Path path = new Path(file);
- Reader reader = getReader(path, conf, Lists.<String>newArrayList());
- if (reader == null) {
- continue;
- }
- printJsonData(reader);
- System.out.println(SEPARATOR);
- } catch (Exception e) {
- System.err.println("Unable to dump data for file: " + file);
- continue;
- }
- }
- }
-
- private static void printMetaData(List<String> files, Configuration conf,
- List<Integer> rowIndexCols, boolean printTimeZone, final boolean recover,
- final String backupPath)
- throws IOException {
- List<String> corruptFiles = Lists.newArrayList();
- for (String filename : files) {
- printMetaDataImpl(filename, conf, rowIndexCols, printTimeZone, corruptFiles);
- System.out.println(SEPARATOR);
- }
-
- if (!corruptFiles.isEmpty()) {
- if (recover) {
- recoverFiles(corruptFiles, conf, backupPath);
- } else {
- System.err.println(corruptFiles.size() + " file(s) are corrupted." +
- " Run the following command to recover corrupted files.\n");
- String fileNames = Joiner.on(" ").skipNulls().join(corruptFiles);
- System.err.println("hive --orcfiledump --recover --skip-dump " + fileNames);
- System.out.println(SEPARATOR);
- }
- }
- }
-
- private static void printMetaDataImpl(final String filename,
- final Configuration conf, final List<Integer> rowIndexCols, final boolean printTimeZone,
- final List<String> corruptFiles) throws IOException {
- Path file = new Path(filename);
- Reader reader = getReader(file, conf, corruptFiles);
- // if we can create a reader then the footer is not corrupt and the file is readable
- if (reader == null) {
- return;
- }
-
- System.out.println("Structure for " + filename);
- System.out.println("File Version: " + reader.getFileVersion().getName() +
- " with " + reader.getWriterVersion());
- RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
- System.out.println("Rows: " + reader.getNumberOfRows());
- System.out.println("Compression: " + reader.getCompression());
- if (reader.getCompression() != CompressionKind.NONE) {
- System.out.println("Compression size: " + reader.getCompressionSize());
- }
- System.out.println("Type: " + reader.getObjectInspector().getTypeName());
- System.out.println("\nStripe Statistics:");
- List<StripeStatistics> stripeStats = reader.getStripeStatistics();
- for (int n = 0; n < stripeStats.size(); n++) {
- System.out.println(" Stripe " + (n + 1) + ":");
- StripeStatistics ss = stripeStats.get(n);
- for (int i = 0; i < ss.getColumnStatistics().length; ++i) {
- System.out.println(" Column " + i + ": " +
- ss.getColumnStatistics()[i].toString());
- }
- }
- ColumnStatistics[] stats = reader.getStatistics();
- int colCount = stats.length;
- System.out.println("\nFile Statistics:");
- for (int i = 0; i < stats.length; ++i) {
- System.out.println(" Column " + i + ": " + stats[i].toString());
- }
- System.out.println("\nStripes:");
- int stripeIx = -1;
- for (StripeInformation stripe : reader.getStripes()) {
- ++stripeIx;
- long stripeStart = stripe.getOffset();
- OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
- if (printTimeZone) {
- String tz = footer.getWriterTimezone();
- if (tz == null || tz.isEmpty()) {
- tz = UNKNOWN;
- }
- System.out.println(" Stripe: " + stripe.toString() + " timezone: " + tz);
- } else {
- System.out.println(" Stripe: " + stripe.toString());
- }
- long sectionStart = stripeStart;
- for (OrcProto.Stream section : footer.getStreamsList()) {
- String kind = section.hasKind() ? section.getKind().name() : UNKNOWN;
- System.out.println(" Stream: column " + section.getColumn() +
- " section " + kind + " start: " + sectionStart +
- " length " + section.getLength());
- sectionStart += section.getLength();
- }
- for (int i = 0; i < footer.getColumnsCount(); ++i) {
- OrcProto.ColumnEncoding encoding = footer.getColumns(i);
- StringBuilder buf = new StringBuilder();
- buf.append(" Encoding column ");
- buf.append(i);
- buf.append(": ");
- buf.append(encoding.getKind());
- if (encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY ||
- encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
- buf.append("[");
- buf.append(encoding.getDictionarySize());
- buf.append("]");
- }
- System.out.println(buf);
- }
- if (rowIndexCols != null && !rowIndexCols.isEmpty()) {
- // include only the specified columns; the bloom filter is read only for
- // the included columns
- boolean[] sargColumns = new boolean[colCount];
- for (int colIdx : rowIndexCols) {
- sargColumns[colIdx] = true;
- }
- OrcIndex indices = rows
- .readRowIndex(stripeIx, null, null, null, sargColumns);
- for (int col : rowIndexCols) {
- StringBuilder buf = new StringBuilder();
- String rowIdxString = getFormattedRowIndices(col, indices.getRowGroupIndex());
- buf.append(rowIdxString);
- String bloomFilString = getFormattedBloomFilters(col, indices.getBloomFilterIndex());
- buf.append(bloomFilString);
- System.out.println(buf);
- }
- }
- }
-
- FileSystem fs = file.getFileSystem(conf);
- long fileLen = fs.getFileStatus(file).getLen();
- long paddedBytes = getTotalPaddingSize(reader);
- // an empty ORC file is ~45 bytes, so we assume the file length is always > 0
- double percentPadding = ((double) paddedBytes / (double) fileLen) * 100;
- DecimalFormat format = new DecimalFormat("##.##");
- System.out.println("\nFile length: " + fileLen + " bytes");
- System.out.println("Padding length: " + paddedBytes + " bytes");
- System.out.println("Padding ratio: " + format.format(percentPadding) + "%");
- OrcRecordUpdater.AcidStats acidStats = OrcRecordUpdater.parseAcidStats(reader);
- if (acidStats != null) {
- System.out.println("ACID stats:" + acidStats);
- }
- rows.close();
- }
-
- private static void recoverFiles(final List<String> corruptFiles, final Configuration conf,
- final String backup)
- throws IOException {
- for (String corruptFile : corruptFiles) {
- System.err.println("Recovering file " + corruptFile);
- Path corruptPath = new Path(corruptFile);
- FileSystem fs = corruptPath.getFileSystem(conf);
- FSDataInputStream fdis = fs.open(corruptPath);
- try {
- long corruptFileLen = fs.getFileStatus(corruptPath).getLen();
- long remaining = corruptFileLen;
- List<Long> footerOffsets = Lists.newArrayList();
-
- // start reading the data file from top to bottom and record the valid footers
- while (remaining > 0) {
- int toRead = (int) Math.min(DEFAULT_BLOCK_SIZE, remaining);
- byte[] data = new byte[toRead];
- long startPos = corruptFileLen - remaining;
- fdis.readFully(startPos, data, 0, toRead);
-
- // find all MAGIC strings and check whether the file is readable from there
- int index = 0;
- long nextFooterOffset;
-
- while (index != -1) {
- index = indexOf(data, OrcFile.MAGIC.getBytes(), index + 1);
- if (index != -1) {
- nextFooterOffset = startPos + index + OrcFile.MAGIC.length() + 1;
- if (isReadable(corruptPath, conf, nextFooterOffset)) {
- footerOffsets.add(nextFooterOffset);
- }
- }
- }
-
- System.err.println("Scanning for valid footers - startPos: " + startPos +
- " toRead: " + toRead + " remaining: " + remaining);
- remaining = remaining - toRead;
- }
-
- System.err.println("Readable footerOffsets: " + footerOffsets);
- recoverFile(corruptPath, fs, conf, footerOffsets, backup);
- } catch (Exception e) {
- Path recoveryFile = getRecoveryFile(corruptPath);
- if (fs.exists(recoveryFile)) {
- fs.delete(recoveryFile, false);
- }
- System.err.println("Unable to recover file " + corruptFile);
- e.printStackTrace();
- System.err.println(SEPARATOR);
- continue;
- } finally {
- fdis.close();
- }
- System.err.println(corruptFile + " recovered successfully!");
- System.err.println(SEPARATOR);
- }
- }
-
- private static void recoverFile(final Path corruptPath, final FileSystem fs,
- final Configuration conf, final List<Long> footerOffsets, final String backup)
- throws IOException {
-
- // first recover to a .recovered file, then once successful rename it to the actual file
- Path recoveredPath = getRecoveryFile(corruptPath);
-
- // make sure the recovery file does not already exist
- if (fs.exists(recoveredPath)) {
- fs.delete(recoveredPath, false);
- }
-
- // if there are no valid footers, create an empty ORC file so the path is still readable
- if (footerOffsets == null || footerOffsets.isEmpty()) {
- System.err.println("No readable footers found. Creating empty orc file.");
- TypeDescription schema = TypeDescription.createStruct();
- Writer writer = OrcFile.createWriter(recoveredPath,
- OrcFile.writerOptions(conf).setSchema(schema));
- writer.close();
- } else {
- FSDataInputStream fdis = fs.open(corruptPath);
- FileStatus fileStatus = fs.getFileStatus(corruptPath);
- // read corrupt file and copy it to recovered file until last valid footer
- FSDataOutputStream fdos = fs.create(recoveredPath, true,
- conf.getInt("io.file.buffer.size", 4096),
- fileStatus.getReplication(),
- fileStatus.getBlockSize());
- try {
- long fileLen = footerOffsets.get(footerOffsets.size() - 1);
- long remaining = fileLen;
-
- while (remaining > 0) {
- int toRead = (int) Math.min(DEFAULT_BLOCK_SIZE, remaining);
- byte[] data = new byte[toRead];
- long startPos = fileLen - remaining;
- fdis.readFully(startPos, data, 0, toRead);
- fdos.write(data);
- System.err.println("Copying data to recovery file - startPos: " + startPos +
- " toRead: " + toRead + " remaining: " + remaining);
- remaining = remaining - toRead;
- }
- } catch (Exception e) {
- fs.delete(recoveredPath, false);
- throw new IOException(e);
- } finally {
- fdis.close();
- fdos.close();
- }
- }
-
- // validate the recovered file once again and start moving corrupt files to the backup folder
- if (isReadable(recoveredPath, conf, Long.MAX_VALUE)) {
- Path backupDataPath;
- String scheme = corruptPath.toUri().getScheme();
- String authority = corruptPath.toUri().getAuthority();
- String filePath = corruptPath.toUri().getPath();
-
- // use the same filesystem as the corrupt file if backup-path is not explicitly specified
- if (backup.equals(DEFAULT_BACKUP_PATH)) {
- backupDataPath = new Path(scheme, authority, DEFAULT_BACKUP_PATH + filePath);
- } else {
- backupDataPath = Path.mergePaths(new Path(backup), corruptPath);
- }
-
- // Move data file to backup path
- moveFiles(fs, corruptPath, backupDataPath);
-
- // Move side file to backup path
- Path sideFilePath = OrcRecordUpdater.getSideFile(corruptPath);
- Path backupSideFilePath = new Path(backupDataPath.getParent(), sideFilePath.getName());
- moveFiles(fs, sideFilePath, backupSideFilePath);
-
- // finally move recovered file to actual file
- moveFiles(fs, recoveredPath, corruptPath);
-
- // we are done recovering, backing up and validating
- System.err.println("Validation of recovered file successful!");
- }
- }
-
- private static void moveFiles(final FileSystem fs, final Path src, final Path dest)
- throws IOException {
- try {
- // create the dest directory if it does not exist
- if (!fs.exists(dest.getParent())) {
- fs.mkdirs(dest.getParent());
- }
-
- // if the destination file exists for some reason, delete it
- fs.delete(dest, false);
-
- if (fs.rename(src, dest)) {
- System.err.println("Moved " + src + " to " + dest);
- } else {
- throw new IOException("Unable to move " + src + " to " + dest);
- }
-
- } catch (Exception e) {
- throw new IOException("Unable to move " + src + " to " + dest, e);
- }
- }
-
- private static Path getRecoveryFile(final Path corruptPath) {
- return new Path(corruptPath.getParent(), corruptPath.getName() + ".recovered");
- }
-
- private static boolean isReadable(final Path corruptPath, final Configuration conf,
- final long maxLen) {
- try {
- OrcFile.createReader(corruptPath, OrcFile.readerOptions(conf).maxLength(maxLen));
- return true;
- } catch (Exception e) {
- // ignore the exception; the file is not readable at maxLen
- return false;
- }
- }
-
- // search for a byte pattern in a byte array, starting at the given index
- private static int indexOf(final byte[] data, final byte[] pattern, final int index) {
- if (data == null || data.length == 0 || pattern == null || pattern.length == 0 ||
- index > data.length || index < 0) {
- return -1;
- }
-
- int j = 0;
- for (int i = index; i < data.length; i++) {
- if (pattern[j] == data[i]) {
- j++;
- } else {
- // on a mismatch, re-check the current byte against the first pattern byte
- // so that overlapping candidates (e.g. finding "ORC" inside "OORC") are
- // not missed by the restart
- j = pattern[0] == data[i] ? 1 : 0;
- }
-
- if (j == pattern.length) {
- return i - pattern.length + 1;
- }
- }
-
- return -1;
- }
-
- private static String getFormattedBloomFilters(int col,
- OrcProto.BloomFilterIndex[] bloomFilterIndex) {
- StringBuilder buf = new StringBuilder();
- BloomFilterIO stripeLevelBF = null;
- if (bloomFilterIndex != null && bloomFilterIndex[col] != null) {
- int idx = 0;
- buf.append("\n Bloom filters for column ").append(col).append(":");
- for (OrcProto.BloomFilter bf : bloomFilterIndex[col].getBloomFilterList()) {
- BloomFilterIO toMerge = new BloomFilterIO(bf);
- buf.append("\n Entry ").append(idx++).append(":").append(getBloomFilterStats(toMerge));
- if (stripeLevelBF == null) {
- stripeLevelBF = toMerge;
- } else {
- stripeLevelBF.merge(toMerge);
- }
- }
- String bloomFilterStats = getBloomFilterStats(stripeLevelBF);
- buf.append("\n Stripe level merge:").append(bloomFilterStats);
- }
- return buf.toString();
- }
-
- private static String getBloomFilterStats(BloomFilterIO bf) {
- StringBuilder sb = new StringBuilder();
- int bitCount = bf.getBitSize();
- int popCount = 0;
- for (long l : bf.getBitSet()) {
- popCount += Long.bitCount(l);
- }
- int k = bf.getNumHashFunctions();
- float loadFactor = (float) popCount / (float) bitCount;
- float expectedFpp = (float) Math.pow(loadFactor, k);
- DecimalFormat df = new DecimalFormat("###.####");
- sb.append(" numHashFunctions: ").append(k);
- sb.append(" bitCount: ").append(bitCount);
- sb.append(" popCount: ").append(popCount);
- sb.append(" loadFactor: ").append(df.format(loadFactor));
- sb.append(" expectedFpp: ").append(expectedFpp);
- return sb.toString();
- }
-
- private static String getFormattedRowIndices(int col,
- OrcProto.RowIndex[] rowGroupIndex) {
- StringBuilder buf = new StringBuilder();
- OrcProto.RowIndex index;
- buf.append(" Row group indices for column ").append(col).append(":");
- if (rowGroupIndex == null || (col >= rowGroupIndex.length) ||
- ((index = rowGroupIndex[col]) == null)) {
- buf.append(" not found\n");
- return buf.toString();
- }
-
- for (int entryIx = 0; entryIx < index.getEntryCount(); ++entryIx) {
- buf.append("\n Entry ").append(entryIx).append(": ");
- OrcProto.RowIndexEntry entry = index.getEntry(entryIx);
- if (entry == null) {
- buf.append("unknown\n");
- continue;
- }
- OrcProto.ColumnStatistics colStats = entry.getStatistics();
- if (colStats == null) {
- buf.append("no stats at ");
- } else {
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(colStats);
- buf.append(cs.toString());
- }
- buf.append(" positions: ");
- for (int posIx = 0; posIx < entry.getPositionsCount(); ++posIx) {
- if (posIx != 0) {
- buf.append(",");
- }
- buf.append(entry.getPositions(posIx));
- }
- }
- return buf.toString();
- }
-
- public static long getTotalPaddingSize(Reader reader) throws IOException {
- long paddedBytes = 0;
- List<StripeInformation> stripes = reader.getStripes();
- for (int i = 1; i < stripes.size(); i++) {
- long prevStripeOffset = stripes.get(i - 1).getOffset();
- long prevStripeLen = stripes.get(i - 1).getLength();
- paddedBytes += stripes.get(i).getOffset() - (prevStripeOffset + prevStripeLen);
- }
- return paddedBytes;
- }
-
- static Options createOptions() {
- Options result = new Options();
-
- // add -d and --data to print the rows
- result.addOption(OptionBuilder
- .withLongOpt("data")
- .withDescription("Should the data be printed")
- .create('d'));
-
- // printing the time zone is optional, to avoid breaking the file dump
- // unit tests when they run in different time zones
- result.addOption(OptionBuilder
- .withLongOpt("timezone")
- .withDescription("Print writer's time zone")
- .create('t'));
-
- result.addOption(OptionBuilder
- .withLongOpt("help")
- .withDescription("print help message")
- .create('h'));
-
- result.addOption(OptionBuilder
- .withLongOpt("rowindex")
- .withArgName("comma separated list of column ids for which row index should be printed")
- .withDescription("Dump stats for column number(s)")
- .hasArg()
- .create('r'));
-
- result.addOption(OptionBuilder
- .withLongOpt("json")
- .withDescription("Print metadata in JSON format")
- .create('j'));
-
- result.addOption(OptionBuilder
- .withLongOpt("pretty")
- .withDescription("Pretty print json metadata output")
- .create('p'));
-
- result.addOption(OptionBuilder
- .withLongOpt("recover")
- .withDescription("recover corrupted orc files generated by streaming")
- .create());
-
- result.addOption(OptionBuilder
- .withLongOpt("skip-dump")
- .withDescription("used along with --recover to directly recover files without dumping")
- .create());
-
- result.addOption(OptionBuilder
- .withLongOpt("backup-path")
- .withDescription("specify a backup path to store the corrupted files (default: /tmp)")
- .hasArg()
- .create());
- return result;
- }
-
- private static void printMap(JSONWriter writer,
- Map<Object, Object> obj,
- List<OrcProto.Type> types,
- OrcProto.Type type
- ) throws IOException, JSONException {
- writer.array();
- int keyType = type.getSubtypes(0);
- int valueType = type.getSubtypes(1);
- for (Map.Entry<Object, Object> item : obj.entrySet()) {
- writer.object();
- writer.key("_key");
- printObject(writer, item.getKey(), types, keyType);
- writer.key("_value");
- printObject(writer, item.getValue(), types, valueType);
- writer.endObject();
- }
- writer.endArray();
- }
-
- private static void printList(JSONWriter writer,
- List<Object> obj,
- List<OrcProto.Type> types,
- OrcProto.Type type
- ) throws IOException, JSONException {
- int subtype = type.getSubtypes(0);
- writer.array();
- for (Object item : obj) {
- printObject(writer, item, types, subtype);
- }
- writer.endArray();
- }
-
- private static void printUnion(JSONWriter writer,
- OrcUnion obj,
- List<OrcProto.Type> types,
- OrcProto.Type type
- ) throws IOException, JSONException {
- int subtype = type.getSubtypes(obj.getTag());
- printObject(writer, obj.getObject(), types, subtype);
- }
-
- static void printStruct(JSONWriter writer,
- OrcStruct obj,
- List<OrcProto.Type> types,
- OrcProto.Type type) throws IOException, JSONException {
- writer.object();
- List<Integer> fieldTypes = type.getSubtypesList();
- for (int i = 0; i < fieldTypes.size(); ++i) {
- writer.key(type.getFieldNames(i));
- printObject(writer, obj.getFieldValue(i), types, fieldTypes.get(i));
- }
- writer.endObject();
- }
-
- static void printObject(JSONWriter writer,
- Object obj,
- List<OrcProto.Type> types,
- int typeId) throws IOException, JSONException {
- OrcProto.Type type = types.get(typeId);
- if (obj == null) {
- writer.value(null);
- } else {
- switch (type.getKind()) {
- case STRUCT:
- printStruct(writer, (OrcStruct) obj, types, type);
- break;
- case UNION:
- printUnion(writer, (OrcUnion) obj, types, type);
- break;
- case LIST:
- printList(writer, (List<Object>) obj, types, type);
- break;
- case MAP:
- printMap(writer, (Map<Object, Object>) obj, types, type);
- break;
- case BYTE:
- writer.value(((ByteWritable) obj).get());
- break;
- case SHORT:
- writer.value(((ShortWritable) obj).get());
- break;
- case INT:
- writer.value(((IntWritable) obj).get());
- break;
- case LONG:
- writer.value(((LongWritable) obj).get());
- break;
- case FLOAT:
- writer.value(((FloatWritable) obj).get());
- break;
- case DOUBLE:
- writer.value(((DoubleWritable) obj).get());
- break;
- case BOOLEAN:
- writer.value(((BooleanWritable) obj).get());
- break;
- default:
- writer.value(obj.toString());
- break;
- }
- }
- }
-
- static void printJsonData(final Reader reader) throws IOException, JSONException {
- PrintStream printStream = System.out;
- OutputStreamWriter out = new OutputStreamWriter(printStream, "UTF-8");
- RecordReader rows = reader.rows(null);
- Object row = null;
- try {
- List<OrcProto.Type> types = reader.getTypes();
- while (rows.hasNext()) {
- row = rows.next(row);
- JSONWriter writer = new JSONWriter(out);
- printObject(writer, row, types, 0);
- out.write("\n");
- out.flush();
- if (printStream.checkError()) {
- throw new IOException("Error encountered when writing to stdout.");
- }
- }
- } finally {
- rows.close();
- }
- }
-}
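
The recovery path in the deleted FileDump above reduces to: scan the data file block by block for the ORC MAGIC bytes, treat each hit as a candidate footer boundary, and keep only those offsets at which a length-capped reader can actually be opened (isReadable). A self-contained sketch of that candidate collection over a single in-memory buffer, with isReadable stubbed out; in the real tool it attempts OrcFile.createReader with maxLength set to the candidate offset:

    import java.nio.charset.StandardCharsets;
    import java.util.ArrayList;
    import java.util.List;

    class FooterScanSketch {
      static final byte[] MAGIC = "ORC".getBytes(StandardCharsets.US_ASCII);

      // stub for FileDump.isReadable(path, conf, offset): here we only
      // bounds-check; the real tool tries to open a reader at the offset
      static boolean isReadable(byte[] data, long maxLen) {
        return maxLen > 0 && maxLen <= data.length;
      }

      // mirrors the inner loop of recoverFiles above: each MAGIC hit yields
      // a candidate footer offset of hit + MAGIC.length + 1, and only the
      // readable candidates are kept; the search starts at index 1, skipping
      // the header magic at offset 0
      static List<Long> candidateFooters(byte[] data) {
        List<Long> offsets = new ArrayList<>();
        int index = 0;
        while (index != -1) {
          index = indexOf(data, MAGIC, index + 1);
          if (index != -1) {
            long footerOffset = index + MAGIC.length + 1;
            if (isReadable(data, footerOffset)) {
              offsets.add(footerOffset);
            }
          }
        }
        return offsets;
      }

      // same contract as FileDump.indexOf above, written as a plain
      // position-by-position scan
      static int indexOf(byte[] data, byte[] pattern, int index) {
        if (data == null || pattern == null || pattern.length == 0 || index < 0) {
          return -1;
        }
        for (int i = index; i + pattern.length <= data.length; i++) {
          int j = 0;
          while (j < pattern.length && data[i + j] == pattern[j]) {
            j++;
          }
          if (j == pattern.length) {
            return i;
          }
        }
        return -1;
      }
    }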
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/java/org/apache/hadoop/hive/ql/io/orc/JsonFileDump.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/JsonFileDump.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/JsonFileDump.java
deleted file mode 100644
index 00de545..0000000
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/JsonFileDump.java
+++ /dev/null
@@ -1,401 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.hive.ql.io.orc;
-
-import java.io.IOException;
-import java.util.List;
-import java.util.Set;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.codehaus.jettison.json.JSONArray;
-import org.apache.orc.BloomFilterIO;
-import org.apache.orc.BinaryColumnStatistics;
-import org.apache.orc.BooleanColumnStatistics;
-import org.apache.orc.ColumnStatistics;
-import org.apache.orc.impl.ColumnStatisticsImpl;
-import org.apache.orc.DateColumnStatistics;
-import org.apache.orc.DecimalColumnStatistics;
-import org.apache.orc.DoubleColumnStatistics;
-import org.apache.orc.IntegerColumnStatistics;
-import org.apache.orc.impl.OrcIndex;
-import org.apache.orc.OrcProto;
-import org.apache.orc.StringColumnStatistics;
-import org.apache.orc.StripeInformation;
-import org.apache.orc.StripeStatistics;
-import org.apache.orc.TimestampColumnStatistics;
-import org.codehaus.jettison.json.JSONException;
-import org.codehaus.jettison.json.JSONObject;
-import org.codehaus.jettison.json.JSONStringer;
-import org.codehaus.jettison.json.JSONWriter;
-
-/**
- * File dump tool with JSON-formatted output.
- */
-public class JsonFileDump {
-
- public static void printJsonMetaData(List<String> files,
- Configuration conf,
- List<Integer> rowIndexCols, boolean prettyPrint, boolean printTimeZone)
- throws JSONException, IOException {
- if (files.isEmpty()) {
- return;
- }
- JSONStringer writer = new JSONStringer();
- boolean multiFile = files.size() > 1;
- if (multiFile) {
- writer.array();
- } else {
- writer.object();
- }
- for (String filename : files) {
- try {
- if (multiFile) {
- writer.object();
- }
- writer.key("fileName").value(filename);
- Path path = new Path(filename);
- Reader reader = FileDump.getReader(path, conf, null);
- if (reader == null) {
- writer.key("status").value("FAILED");
- continue;
- }
- writer.key("fileVersion").value(reader.getFileVersion().getName());
- writer.key("writerVersion").value(reader.getWriterVersion());
- RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
- writer.key("numberOfRows").value(reader.getNumberOfRows());
- writer.key("compression").value(reader.getCompression());
- if (reader.getCompression() != CompressionKind.NONE) {
- writer.key("compressionBufferSize").value(reader.getCompressionSize());
- }
- writer.key("schemaString").value(reader.getObjectInspector().getTypeName());
- writer.key("schema").array();
- writeSchema(writer, reader.getTypes());
- writer.endArray();
-
- writer.key("stripeStatistics").array();
- List<StripeStatistics> stripeStatistics = reader.getStripeStatistics();
- for (int n = 0; n < stripeStatistics.size(); n++) {
- writer.object();
- writer.key("stripeNumber").value(n + 1);
- StripeStatistics ss = stripeStatistics.get(n);
- writer.key("columnStatistics").array();
- for (int i = 0; i < ss.getColumnStatistics().length; i++) {
- writer.object();
- writer.key("columnId").value(i);
- writeColumnStatistics(writer, ss.getColumnStatistics()[i]);
- writer.endObject();
- }
- writer.endArray();
- writer.endObject();
- }
- writer.endArray();
-
- ColumnStatistics[] stats = reader.getStatistics();
- int colCount = stats.length;
- writer.key("fileStatistics").array();
- for (int i = 0; i < stats.length; ++i) {
- writer.object();
- writer.key("columnId").value(i);
- writeColumnStatistics(writer, stats[i]);
- writer.endObject();
- }
- writer.endArray();
-
- writer.key("stripes").array();
- int stripeIx = -1;
- for (StripeInformation stripe : reader.getStripes()) {
- ++stripeIx;
- long stripeStart = stripe.getOffset();
- OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
- writer.object(); // start of stripe information
- writer.key("stripeNumber").value(stripeIx + 1);
- writer.key("stripeInformation");
- writeStripeInformation(writer, stripe);
- if (printTimeZone) {
- writer.key("writerTimezone").value(
- footer.hasWriterTimezone() ? footer.getWriterTimezone() : FileDump.UNKNOWN);
- }
- long sectionStart = stripeStart;
-
- writer.key("streams").array();
- for (OrcProto.Stream section : footer.getStreamsList()) {
- writer.object();
- String kind = section.hasKind() ? section.getKind().name() : FileDump.UNKNOWN;
- writer.key("columnId").value(section.getColumn());
- writer.key("section").value(kind);
- writer.key("startOffset").value(sectionStart);
- writer.key("length").value(section.getLength());
- sectionStart += section.getLength();
- writer.endObject();
- }
- writer.endArray();
-
- writer.key("encodings").array();
- for (int i = 0; i < footer.getColumnsCount(); ++i) {
- writer.object();
- OrcProto.ColumnEncoding encoding = footer.getColumns(i);
- writer.key("columnId").value(i);
- writer.key("kind").value(encoding.getKind());
- if (encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY ||
- encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
- writer.key("dictionarySize").value(encoding.getDictionarySize());
- }
- writer.endObject();
- }
- writer.endArray();
-
- if (rowIndexCols != null && !rowIndexCols.isEmpty()) {
- // include only the specified columns; the bloom filter is read only for
- // the included columns
- boolean[] sargColumns = new boolean[colCount];
- for (int colIdx : rowIndexCols) {
- sargColumns[colIdx] = true;
- }
- OrcIndex indices = rows.readRowIndex(stripeIx, null, sargColumns);
- writer.key("indexes").array();
- for (int col : rowIndexCols) {
- writer.object();
- writer.key("columnId").value(col);
- writeRowGroupIndexes(writer, col, indices.getRowGroupIndex());
- writeBloomFilterIndexes(writer, col, indices.getBloomFilterIndex());
- writer.endObject();
- }
- writer.endArray();
- }
- writer.endObject(); // end of stripe information
- }
- writer.endArray();
-
- FileSystem fs = path.getFileSystem(conf);
- long fileLen = fs.getContentSummary(path).getLength();
- long paddedBytes = FileDump.getTotalPaddingSize(reader);
- // an empty ORC file is ~45 bytes, so we assume the file length is always > 0
- double percentPadding = ((double) paddedBytes / (double) fileLen) * 100;
- writer.key("fileLength").value(fileLen);
- writer.key("paddingLength").value(paddedBytes);
- writer.key("paddingRatio").value(percentPadding);
- OrcRecordUpdater.AcidStats acidStats = OrcRecordUpdater.parseAcidStats(reader);
- if (acidStats != null) {
- writer.key("numInserts").value(acidStats.inserts);
- writer.key("numDeletes").value(acidStats.deletes);
- writer.key("numUpdates").value(acidStats.updates);
- }
- writer.key("status").value("OK");
- rows.close();
-
- writer.endObject();
- } catch (Exception e) {
- writer.key("status").value("FAILED");
- throw e;
- }
- }
- if (multiFile) {
- writer.endArray();
- }
-
- if (prettyPrint) {
- final String prettyJson;
- if (multiFile) {
- JSONArray jsonArray = new JSONArray(writer.toString());
- prettyJson = jsonArray.toString(2);
- } else {
- JSONObject jsonObject = new JSONObject(writer.toString());
- prettyJson = jsonObject.toString(2);
- }
- System.out.println(prettyJson);
- } else {
- System.out.println(writer.toString());
- }
- }
-
- private static void writeSchema(JSONStringer writer, List<OrcProto.Type> types)
- throws JSONException {
- int i = 0;
- for(OrcProto.Type type : types) {
- writer.object();
- writer.key("columnId").value(i++);
- writer.key("columnType").value(type.getKind());
- if (type.getFieldNamesCount() > 0) {
- writer.key("childColumnNames").array();
- for (String field : type.getFieldNamesList()) {
- writer.value(field);
- }
- writer.endArray();
- writer.key("childColumnIds").array();
- for (Integer colId : type.getSubtypesList()) {
- writer.value(colId);
- }
- writer.endArray();
- }
- if (type.hasPrecision()) {
- writer.key("precision").value(type.getPrecision());
- }
-
- if (type.hasScale()) {
- writer.key("scale").value(type.getScale());
- }
-
- if (type.hasMaximumLength()) {
- writer.key("maxLength").value(type.getMaximumLength());
- }
- writer.endObject();
- }
- }
-
- private static void writeStripeInformation(JSONWriter writer, StripeInformation stripe)
- throws JSONException {
- writer.object();
- writer.key("offset").value(stripe.getOffset());
- writer.key("indexLength").value(stripe.getIndexLength());
- writer.key("dataLength").value(stripe.getDataLength());
- writer.key("footerLength").value(stripe.getFooterLength());
- writer.key("rowCount").value(stripe.getNumberOfRows());
- writer.endObject();
- }
-
- private static void writeColumnStatistics(JSONWriter writer, ColumnStatistics cs)
- throws JSONException {
- if (cs != null) {
- writer.key("count").value(cs.getNumberOfValues());
- writer.key("hasNull").value(cs.hasNull());
- if (cs instanceof BinaryColumnStatistics) {
- writer.key("totalLength").value(((BinaryColumnStatistics) cs).getSum());
- writer.key("type").value(OrcProto.Type.Kind.BINARY);
- } else if (cs instanceof BooleanColumnStatistics) {
- writer.key("trueCount").value(((BooleanColumnStatistics) cs).getTrueCount());
- writer.key("falseCount").value(((BooleanColumnStatistics) cs).getFalseCount());
- writer.key("type").value(OrcProto.Type.Kind.BOOLEAN);
- } else if (cs instanceof IntegerColumnStatistics) {
- writer.key("min").value(((IntegerColumnStatistics) cs).getMinimum());
- writer.key("max").value(((IntegerColumnStatistics) cs).getMaximum());
- if (((IntegerColumnStatistics) cs).isSumDefined()) {
- writer.key("sum").value(((IntegerColumnStatistics) cs).getSum());
- }
- writer.key("type").value(OrcProto.Type.Kind.LONG);
- } else if (cs instanceof DoubleColumnStatistics) {
- writer.key("min").value(((DoubleColumnStatistics) cs).getMinimum());
- writer.key("max").value(((DoubleColumnStatistics) cs).getMaximum());
- writer.key("sum").value(((DoubleColumnStatistics) cs).getSum());
- writer.key("type").value(OrcProto.Type.Kind.DOUBLE);
- } else if (cs instanceof StringColumnStatistics) {
- writer.key("min").value(((StringColumnStatistics) cs).getMinimum());
- writer.key("max").value(((StringColumnStatistics) cs).getMaximum());
- writer.key("totalLength").value(((StringColumnStatistics) cs).getSum());
- writer.key("type").value(OrcProto.Type.Kind.STRING);
- } else if (cs instanceof DateColumnStatistics) {
- if (((DateColumnStatistics) cs).getMaximum() != null) {
- writer.key("min").value(((DateColumnStatistics) cs).getMinimum());
- writer.key("max").value(((DateColumnStatistics) cs).getMaximum());
- }
- writer.key("type").value(OrcProto.Type.Kind.DATE);
- } else if (cs instanceof TimestampColumnStatistics) {
- if (((TimestampColumnStatistics) cs).getMaximum() != null) {
- writer.key("min").value(((TimestampColumnStatistics) cs).getMinimum());
- writer.key("max").value(((TimestampColumnStatistics) cs).getMaximum());
- }
- writer.key("type").value(OrcProto.Type.Kind.TIMESTAMP);
- } else if (cs instanceof DecimalColumnStatistics) {
- if (((DecimalColumnStatistics) cs).getMaximum() != null) {
- writer.key("min").value(((DecimalColumnStatistics) cs).getMinimum());
- writer.key("max").value(((DecimalColumnStatistics) cs).getMaximum());
- writer.key("sum").value(((DecimalColumnStatistics) cs).getSum());
- }
- writer.key("type").value(OrcProto.Type.Kind.DECIMAL);
- }
- }
- }
-
- private static void writeBloomFilterIndexes(JSONWriter writer, int col,
- OrcProto.BloomFilterIndex[] bloomFilterIndex) throws JSONException {
-
- BloomFilterIO stripeLevelBF = null;
- if (bloomFilterIndex != null && bloomFilterIndex[col] != null) {
- int entryIx = 0;
- writer.key("bloomFilterIndexes").array();
- for (OrcProto.BloomFilter bf : bloomFilterIndex[col].getBloomFilterList()) {
- writer.object();
- writer.key("entryId").value(entryIx++);
- BloomFilterIO toMerge = new BloomFilterIO(bf);
- writeBloomFilterStats(writer, toMerge);
- if (stripeLevelBF == null) {
- stripeLevelBF = toMerge;
- } else {
- stripeLevelBF.merge(toMerge);
- }
- writer.endObject();
- }
- writer.endArray();
- }
- if (stripeLevelBF != null) {
- writer.key("stripeLevelBloomFilter");
- writer.object();
- writeBloomFilterStats(writer, stripeLevelBF);
- writer.endObject();
- }
- }
-
- private static void writeBloomFilterStats(JSONWriter writer, BloomFilterIO bf)
- throws JSONException {
- int bitCount = bf.getBitSize();
- int popCount = 0;
- for (long l : bf.getBitSet()) {
- popCount += Long.bitCount(l);
- }
- int k = bf.getNumHashFunctions();
- float loadFactor = (float) popCount / (float) bitCount;
- float expectedFpp = (float) Math.pow(loadFactor, k);
- writer.key("numHashFunctions").value(k);
- writer.key("bitCount").value(bitCount);
- writer.key("popCount").value(popCount);
- writer.key("loadFactor").value(loadFactor);
- writer.key("expectedFpp").value(expectedFpp);
- }
-
- private static void writeRowGroupIndexes(JSONWriter writer, int col,
- OrcProto.RowIndex[] rowGroupIndex)
- throws JSONException {
-
- OrcProto.RowIndex index;
- if (rowGroupIndex == null || (col >= rowGroupIndex.length) ||
- ((index = rowGroupIndex[col]) == null)) {
- return;
- }
-
- writer.key("rowGroupIndexes").array();
- for (int entryIx = 0; entryIx < index.getEntryCount(); ++entryIx) {
- writer.object();
- writer.key("entryId").value(entryIx);
- OrcProto.RowIndexEntry entry = index.getEntry(entryIx);
- if (entry == null) {
- continue;
- }
- OrcProto.ColumnStatistics colStats = entry.getStatistics();
- writeColumnStatistics(writer, ColumnStatisticsImpl.deserialize(colStats));
- writer.key("positions").array();
- for (int posIx = 0; posIx < entry.getPositionsCount(); ++posIx) {
- writer.value(entry.getPositions(posIx));
- }
- writer.endArray();
- writer.endObject();
- }
- writer.endArray();
- }
-
-}
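
Both dump tools above summarize each bloom filter with the same two derived numbers: the load factor (the fraction of bits set) and the expected false positive probability, which is the load factor raised to the number of hash functions k. A worked example with illustrative values, not taken from a real file:

    class BloomFilterMathExample {
      public static void main(String[] args) {
        int bitCount = 10000; // total bits in the filter (getBitSize above)
        int popCount = 2500;  // bits set to one, counted with Long.bitCount
        int k = 4;            // number of hash functions
        float loadFactor = (float) popCount / (float) bitCount;
        float expectedFpp = (float) Math.pow(loadFactor, k);
        System.out.println(loadFactor);  // 0.25
        System.out.println(expectedFpp); // 0.25^4 = 0.00390625
      }
    }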
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java
index 0dd58b7..b9094bf 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java
@@ -18,10 +18,7 @@
package org.apache.hadoop.hive.ql.io.orc;
import java.io.IOException;
-import java.util.ArrayDeque;
-import java.util.ArrayList;
import java.util.Arrays;
-import java.util.Deque;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
@@ -29,22 +26,20 @@ import java.util.TreeMap;
import org.apache.orc.OrcUtils;
import org.apache.orc.StripeInformation;
import org.apache.orc.TypeDescription;
+import org.apache.orc.impl.AcidStats;
+import org.apache.orc.impl.OrcAcidUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.ValidTxnList;
-import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.io.AcidInputFormat;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.RecordIdentifier;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
import com.google.common.annotations.VisibleForTesting;
@@ -494,7 +489,7 @@ public class OrcRawRecordMerger implements AcidInputFormat.RawReader<OrcStruct>{
Path deltaFile = AcidUtils.createBucketFile(delta, bucket);
AcidUtils.ParsedDelta deltaDir = AcidUtils.parsedDelta(delta);
FileSystem fs = deltaFile.getFileSystem(conf);
- long length = getLastFlushLength(fs, deltaFile);
+ long length = OrcAcidUtils.getLastFlushLength(fs, deltaFile);
if (length != -1 && fs.exists(deltaFile)) {
Reader deltaReader = OrcFile.createReader(deltaFile,
OrcFile.readerOptions(conf).maxLength(length));
@@ -504,7 +499,7 @@ public class OrcRawRecordMerger implements AcidInputFormat.RawReader<OrcStruct>{
// it can produce wrong results (if the latest valid version of the record is filtered out by
// the sarg) or ArrayOutOfBounds errors (when the sarg is applied to a delete record)
// unless the delta only has insert events
- OrcRecordUpdater.AcidStats acidStats = OrcRecordUpdater.parseAcidStats(deltaReader);
+ AcidStats acidStats = OrcAcidUtils.parseAcidStats(deltaReader);
if(acidStats.deletes > 0 || acidStats.updates > 0) {
deltaEventOptions = eventOptions.clone().searchArgument(null, null);
}
@@ -536,28 +531,6 @@ public class OrcRawRecordMerger implements AcidInputFormat.RawReader<OrcStruct>{
}
}
- /**
- * Read the side file to get the last flush length.
- * @param fs the file system to use
- * @param deltaFile the path of the delta file
- * @return the maximum size of the file to use
- * @throws IOException
- */
- static long getLastFlushLength(FileSystem fs,
- Path deltaFile) throws IOException {
- Path lengths = OrcRecordUpdater.getSideFile(deltaFile);
- long result = Long.MAX_VALUE;
- try (FSDataInputStream stream = fs.open(lengths)) {
- result = -1;
- while (stream.available() > 0) {
- result = stream.readLong();
- }
- return result;
- } catch (IOException ioe) {
- return result;
- }
- }
-
@VisibleForTesting
RecordIdentifier getMinKey() {
return minKey;
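
After this hunk, the side-file helpers live in the orc module as OrcAcidUtils. A sketch of a call site after the move, pieced together from the replacement lines above; getLastFlushLength keeps the removed method's contract of returning -1 when the side file yields no usable footer offset. Sketch only: Reader and OrcFile resolve against the surrounding org.apache.hadoop.hive.ql.io.orc package.

    import java.io.IOException;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.orc.impl.AcidStats;
    import org.apache.orc.impl.OrcAcidUtils;

    class DeltaOpenSketch {
      static Reader openDelta(Configuration conf, Path deltaFile)
          throws IOException {
        FileSystem fs = deltaFile.getFileSystem(conf);
        long length = OrcAcidUtils.getLastFlushLength(fs, deltaFile);
        if (length == -1 || !fs.exists(deltaFile)) {
          return null; // no flushed footer recorded yet
        }
        // cap the reader at the last flushed footer, as in the hunk above
        Reader reader = OrcFile.createReader(deltaFile,
            OrcFile.readerOptions(conf).maxLength(length));
        AcidStats acidStats = OrcAcidUtils.parseAcidStats(reader);
        if (acidStats.deletes > 0 || acidStats.updates > 0) {
          // the merger disables the search argument in this case (see above)
        }
        return reader;
      }
    }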
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java
index d085c58..4bf2403 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java
@@ -25,6 +25,8 @@ import java.nio.charset.CharsetDecoder;
import java.util.ArrayList;
import java.util.List;
+import org.apache.orc.impl.AcidStats;
+import org.apache.orc.impl.OrcAcidUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
@@ -55,7 +57,6 @@ public class OrcRecordUpdater implements RecordUpdater {
public static final String ACID_KEY_INDEX_NAME = "hive.acid.key.index";
public static final String ACID_FORMAT = "_orc_acid_version";
- public static final String ACID_STATS = "hive.acid.stats";
public static final int ORC_ACID_VERSION = 0;
@@ -102,46 +103,6 @@ public class OrcRecordUpdater implements RecordUpdater {
private LongObjectInspector origTxnInspector; // OI for the original txn inside the record
// identifier
- static class AcidStats {
- long inserts;
- long updates;
- long deletes;
-
- AcidStats() {
- // nothing
- }
-
- AcidStats(String serialized) {
- String[] parts = serialized.split(",");
- inserts = Long.parseLong(parts[0]);
- updates = Long.parseLong(parts[1]);
- deletes = Long.parseLong(parts[2]);
- }
-
- String serialize() {
- StringBuilder builder = new StringBuilder();
- builder.append(inserts);
- builder.append(",");
- builder.append(updates);
- builder.append(",");
- builder.append(deletes);
- return builder.toString();
- }
-
- @Override
- public String toString() {
- StringBuilder builder = new StringBuilder();
- builder.append(" inserts: ").append(inserts);
- builder.append(" updates: ").append(updates);
- builder.append(" deletes: ").append(deletes);
- return builder.toString();
- }
- }
-
- public static Path getSideFile(Path main) {
- return new Path(main + AcidUtils.DELTA_SIDE_FILE_SUFFIX);
- }
-
static int getOperation(OrcStruct struct) {
return ((IntWritable) struct.getFieldValue(OPERATION)).get();
}
@@ -237,7 +198,7 @@ public class OrcRecordUpdater implements RecordUpdater {
}
if (options.getMinimumTransactionId() != options.getMaximumTransactionId()
&& !options.isWritingBase()){
- flushLengths = fs.create(getSideFile(this.path), true, 8,
+ flushLengths = fs.create(OrcAcidUtils.getSideFile(this.path), true, 8,
options.getReporter());
} else {
flushLengths = null;
@@ -297,7 +258,7 @@ public class OrcRecordUpdater implements RecordUpdater {
}
Reader reader = OrcFile.createReader(matchingBucket, OrcFile.readerOptions(options.getConfiguration()));
//no close() on Reader?!
- AcidStats acidStats = parseAcidStats(reader);
+ AcidStats acidStats = OrcAcidUtils.parseAcidStats(reader);
if(acidStats.inserts > 0) {
return acidStats.inserts;
}
@@ -412,7 +373,7 @@ public class OrcRecordUpdater implements RecordUpdater {
}
if (flushLengths != null) {
flushLengths.close();
- fs.delete(getSideFile(path), false);
+ fs.delete(OrcAcidUtils.getSideFile(path), false);
}
writer = null;
}
@@ -456,26 +417,6 @@ public class OrcRecordUpdater implements RecordUpdater {
}
return result;
}
- /**
- * {@link KeyIndexBuilder} creates these
- */
- static AcidStats parseAcidStats(Reader reader) {
- if (reader.hasMetadataValue(OrcRecordUpdater.ACID_STATS)) {
- String statsSerialized;
- try {
- ByteBuffer val =
- reader.getMetadataValue(OrcRecordUpdater.ACID_STATS)
- .duplicate();
- statsSerialized = utf8Decoder.decode(val).toString();
- } catch (CharacterCodingException e) {
- throw new IllegalArgumentException("Bad string encoding for " +
- OrcRecordUpdater.ACID_STATS, e);
- }
- return new AcidStats(statsSerialized);
- } else {
- return null;
- }
- }
static class KeyIndexBuilder implements OrcFile.WriterCallback {
StringBuilder lastKey = new StringBuilder();
@@ -500,7 +441,7 @@ public class OrcRecordUpdater implements RecordUpdater {
) throws IOException {
context.getWriter().addUserMetadata(ACID_KEY_INDEX_NAME,
UTF8.encode(lastKey.toString()));
- context.getWriter().addUserMetadata(ACID_STATS,
+ context.getWriter().addUserMetadata(OrcAcidUtils.ACID_STATS,
UTF8.encode(acidStats.serialize()));
}
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
index b7437be..3a2e7d8 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
@@ -22,17 +22,9 @@ import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
import java.util.List;
-import java.util.Set;
-import com.google.common.collect.Lists;
-import org.apache.orc.OrcUtils;
-import org.apache.orc.TypeDescription;
import org.apache.orc.impl.BufferChunk;
-import org.apache.orc.ColumnStatistics;
-import org.apache.orc.impl.ColumnStatisticsImpl;
import org.apache.orc.CompressionCodec;
import org.apache.orc.FileMetaInfo;
import org.apache.orc.FileMetadata;
@@ -41,47 +33,25 @@ import org.apache.orc.StripeInformation;
import org.apache.orc.StripeStatistics;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.io.DiskRange;
-import org.apache.hadoop.hive.ql.io.FileFormatException;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
-import org.apache.hadoop.hive.ql.util.JavaDataModel;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.io.Text;
import org.apache.orc.OrcProto;
+import com.google.common.collect.Lists;
import com.google.protobuf.CodedInputStream;
-public class ReaderImpl implements Reader {
+public class ReaderImpl extends org.apache.orc.impl.ReaderImpl
+ implements Reader {
private static final Logger LOG = LoggerFactory.getLogger(ReaderImpl.class);
private static final int DIRECTORY_SIZE_GUESS = 16 * 1024;
- protected final FileSystem fileSystem;
- private final long maxLength;
- protected final Path path;
- protected final org.apache.orc.CompressionKind compressionKind;
- protected final CompressionCodec codec;
- protected final int bufferSize;
- private final List<OrcProto.StripeStatistics> stripeStats;
- private final int metadataSize;
- protected final List<OrcProto.Type> types;
- private final TypeDescription schema;
- private final List<OrcProto.UserMetadataItem> userMetadata;
- private final List<OrcProto.ColumnStatistics> fileStats;
- private final List<StripeInformation> stripes;
- protected final int rowIndexStride;
- private final long contentLength, numberOfRows;
-
private final ObjectInspector inspector;
- private long deserializedSize = -1;
- protected final Configuration conf;
- private final List<Integer> versionList;
- private final OrcFile.WriterVersion writerVersion;
//serialized footer - Keeping this around for use by getFileMetaInfo()
// will help avoid cpu cycles spent in deserializing at cost of increased
@@ -91,83 +61,9 @@ public class ReaderImpl implements Reader {
// This will only be set if the file footer/metadata was read from disk.
private final ByteBuffer footerMetaAndPsBuffer;
- public static class StripeInformationImpl
- implements StripeInformation {
- private final OrcProto.StripeInformation stripe;
-
- public StripeInformationImpl(OrcProto.StripeInformation stripe) {
- this.stripe = stripe;
- }
-
- @Override
- public long getOffset() {
- return stripe.getOffset();
- }
-
- @Override
- public long getLength() {
- return stripe.getDataLength() + getIndexLength() + getFooterLength();
- }
-
- @Override
- public long getDataLength() {
- return stripe.getDataLength();
- }
-
- @Override
- public long getFooterLength() {
- return stripe.getFooterLength();
- }
-
- @Override
- public long getIndexLength() {
- return stripe.getIndexLength();
- }
-
- @Override
- public long getNumberOfRows() {
- return stripe.getNumberOfRows();
- }
-
- @Override
- public String toString() {
- return "offset: " + getOffset() + " data: " + getDataLength() +
- " rows: " + getNumberOfRows() + " tail: " + getFooterLength() +
- " index: " + getIndexLength();
- }
- }
-
@Override
- public long getNumberOfRows() {
- return numberOfRows;
- }
-
- @Override
- public List<String> getMetadataKeys() {
- List<String> result = new ArrayList<String>();
- for(OrcProto.UserMetadataItem item: userMetadata) {
- result.add(item.getName());
- }
- return result;
- }
-
- @Override
- public ByteBuffer getMetadataValue(String key) {
- for(OrcProto.UserMetadataItem item: userMetadata) {
- if (item.hasName() && item.getName().equals(key)) {
- return item.getValue().asReadOnlyByteBuffer();
- }
- }
- throw new IllegalArgumentException("Can't find user metadata " + key);
- }
-
- public boolean hasMetadataValue(String key) {
- for(OrcProto.UserMetadataItem item: userMetadata) {
- if (item.hasName() && item.getName().equals(key)) {
- return true;
- }
- }
- return false;
+ public ObjectInspector getObjectInspector() {
+ return inspector;
}
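Everything deleted in this hunk -- StripeInformationImpl and the user-metadata accessors -- moves essentially verbatim into org.apache.orc.impl.ReaderImpl, so existing call sites keep working through inheritance; only getObjectInspector stays Hive-side. A brief usage sketch (the metadata key is hypothetical):

  Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
  for (StripeInformation stripe : reader.getStripes()) {
    long bytes = stripe.getLength();        // index + data + footer, as before
  }
  if (reader.hasMetadataValue("my.app.key")) {
    ByteBuffer value = reader.getMetadataValue("my.app.key");
  }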
@Override
@@ -181,181 +77,19 @@ public class ReaderImpl implements Reader {
compressionKind);
}
- @Override
- public org.apache.orc.CompressionKind getCompressionKind() {
- return compressionKind;
- }
-
- @Override
- public int getCompressionSize() {
- return bufferSize;
- }
-
- @Override
- public List<StripeInformation> getStripes() {
- return stripes;
- }
-
- @Override
- public ObjectInspector getObjectInspector() {
- return inspector;
- }
-
- @Override
- public long getContentLength() {
- return contentLength;
- }
-
- @Override
- public List<OrcProto.Type> getTypes() {
- return types;
- }
-
- @Override
- public OrcFile.Version getFileVersion() {
- for (OrcFile.Version version: OrcFile.Version.values()) {
- if ((versionList != null && !versionList.isEmpty()) &&
- version.getMajor() == versionList.get(0) &&
- version.getMinor() == versionList.get(1)) {
- return version;
- }
- }
- return OrcFile.Version.V_0_11;
- }
-
- @Override
- public OrcFile.WriterVersion getWriterVersion() {
- return writerVersion;
- }
-
- @Override
- public int getRowIndexStride() {
- return rowIndexStride;
- }
-
- @Override
- public ColumnStatistics[] getStatistics() {
- ColumnStatistics[] result = new ColumnStatistics[types.size()];
- for(int i=0; i < result.length; ++i) {
- result[i] = ColumnStatisticsImpl.deserialize(fileStats.get(i));
- }
- return result;
- }
-
- @Override
- public TypeDescription getSchema() {
- return schema;
- }
-
- /**
- * Ensure this is an ORC file to prevent users from trying to read text
- * files or RC files as ORC files.
- * @param in the file being read
- * @param path the filename for error messages
- * @param psLen the postscript length
- * @param buffer the tail of the file
- * @throws IOException
- */
- static void ensureOrcFooter(FSDataInputStream in,
- Path path,
- int psLen,
- ByteBuffer buffer) throws IOException {
- int magicLength = OrcFile.MAGIC.length();
- int fullLength = magicLength + 1;
- if (psLen < fullLength || buffer.remaining() < fullLength) {
- throw new FileFormatException("Malformed ORC file " + path +
- ". Invalid postscript length " + psLen);
- }
- int offset = buffer.arrayOffset() + buffer.position() + buffer.limit() - fullLength;
- byte[] array = buffer.array();
- // now look for the magic string at the end of the postscript.
- if (!Text.decode(array, offset, magicLength).equals(OrcFile.MAGIC)) {
- // If it isn't there, this may be the 0.11.0 version of ORC.
- // Read the first 3 bytes of the file to check for the header
- byte[] header = new byte[magicLength];
- in.readFully(0, header, 0, magicLength);
- // if it isn't there, this isn't an ORC file
-      if (!Text.decode(header, 0, magicLength).equals(OrcFile.MAGIC)) {
- throw new FileFormatException("Malformed ORC file " + path +
- ". Invalid postscript.");
- }
- }
- }
-
- /**
- * Build a version string out of an array.
- * @param version the version number as a list
- * @return the human readable form of the version string
- */
- private static String versionString(List<Integer> version) {
- StringBuilder buffer = new StringBuilder();
- for(int i=0; i < version.size(); ++i) {
- if (i != 0) {
- buffer.append('.');
- }
- buffer.append(version.get(i));
- }
- return buffer.toString();
- }
-
- /**
- * Check to see if this ORC file is from a future version and if so,
- * warn the user that we may not be able to read all of the column encodings.
- * @param log the logger to write any error message to
- * @param path the data source path for error messages
- * @param version the version of hive that wrote the file.
- */
- static void checkOrcVersion(Logger log, Path path, List<Integer> version) {
- if (version.size() >= 1) {
- int major = version.get(0);
- int minor = 0;
- if (version.size() >= 2) {
- minor = version.get(1);
- }
- if (major > OrcFile.Version.CURRENT.getMajor() ||
- (major == OrcFile.Version.CURRENT.getMajor() &&
- minor > OrcFile.Version.CURRENT.getMinor())) {
- log.warn(path + " was written by a future Hive version " +
- versionString(version) +
- ". This file may not be readable by this version of Hive.");
- }
- }
- }
-
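versionString and checkOrcVersion above, like getFileVersion removed earlier in this hunk, are pure functions of the footer's version list and move down unchanged. Restating the version lookup as a hedged sketch of what the inherited reader now does (the original indexes versionList.get(1) without a size guard; the guard here is defensive):

  static OrcFile.Version fileVersion(List<Integer> versionList) {
    if (versionList != null && versionList.size() >= 2) {
      for (OrcFile.Version v : OrcFile.Version.values()) {
        if (v.getMajor() == versionList.get(0) &&
            v.getMinor() == versionList.get(1)) {
          return v;
        }
      }
    }
    return OrcFile.Version.V_0_11;   // files older than version metadata
  }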
/**
* Constructor that lets the user specify additional options.
* @param path pathname for file
* @param options options for reading
* @throws IOException
*/
- public ReaderImpl(Path path, OrcFile.ReaderOptions options) throws IOException {
- FileSystem fs = options.getFilesystem();
- if (fs == null) {
- fs = path.getFileSystem(options.getConfiguration());
- }
- this.fileSystem = fs;
- this.path = path;
- this.conf = options.getConfiguration();
- this.maxLength = options.getMaxLength();
-
+ public ReaderImpl(Path path,
+ OrcFile.ReaderOptions options) throws IOException {
+ super(path, options);
FileMetadata fileMetadata = options.getFileMetadata();
if (fileMetadata != null) {
- this.compressionKind = fileMetadata.getCompressionKind();
- this.bufferSize = fileMetadata.getCompressionBufferSize();
- this.codec = WriterImpl.createCodec(compressionKind);
- this.metadataSize = fileMetadata.getMetadataSize();
- this.stripeStats = fileMetadata.getStripeStats();
- this.versionList = fileMetadata.getVersionList();
- this.writerVersion = OrcFile.WriterVersion.from(fileMetadata.getWriterVersionNum());
- this.types = fileMetadata.getTypes();
- this.rowIndexStride = fileMetadata.getRowIndexStride();
- this.contentLength = fileMetadata.getContentLength();
- this.numberOfRows = fileMetadata.getNumberOfRows();
- this.fileStats = fileMetadata.getFileStats();
- this.stripes = fileMetadata.getStripes();
this.inspector = OrcStruct.createObjectInspector(0, fileMetadata.getTypes());
this.footerByteBuffer = null; // not cached and not needed here
- this.userMetadata = null; // not cached and not needed here
this.footerMetaAndPsBuffer = null;
} else {
FileMetaInfo footerMetaData;
@@ -363,7 +97,7 @@ public class ReaderImpl implements Reader {
footerMetaData = options.getFileMetaInfo();
this.footerMetaAndPsBuffer = null;
} else {
- footerMetaData = extractMetaInfoFromFooter(fs, path,
+ footerMetaData = extractMetaInfoFromFooter(fileSystem, path,
options.getMaxLength());
this.footerMetaAndPsBuffer = footerMetaData.footerMetaAndPsBuffer;
}
@@ -374,37 +108,8 @@ public class ReaderImpl implements Reader {
footerMetaData.footerBuffer
);
this.footerByteBuffer = footerMetaData.footerBuffer;
- this.compressionKind = rInfo.compressionKind;
- this.codec = rInfo.codec;
- this.bufferSize = rInfo.bufferSize;
- this.metadataSize = rInfo.metadataSize;
- this.stripeStats = rInfo.metadata.getStripeStatsList();
- this.types = rInfo.footer.getTypesList();
- this.rowIndexStride = rInfo.footer.getRowIndexStride();
- this.contentLength = rInfo.footer.getContentLength();
- this.numberOfRows = rInfo.footer.getNumberOfRows();
- this.userMetadata = rInfo.footer.getMetadataList();
- this.fileStats = rInfo.footer.getStatisticsList();
this.inspector = rInfo.inspector;
- this.versionList = footerMetaData.versionList;
- this.writerVersion = footerMetaData.writerVersion;
- this.stripes = convertProtoStripesToStripes(rInfo.footer.getStripesList());
}
- this.schema = OrcUtils.convertTypeFromProtobuf(this.types, 0);
- }
-
- /**
- * Get the WriterVersion based on the ORC file postscript.
- * @param writerVersion the integer writer version
- * @return the writer version of the file
- */
- static OrcFile.WriterVersion getWriterVersion(int writerVersion) {
- for(OrcFile.WriterVersion version: OrcFile.WriterVersion.values()) {
- if (version.getId() == writerVersion) {
- return version;
- }
- }
- return OrcFile.WriterVersion.FUTURE;
}
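With the field initialization gone, the constructor reduces to a thin shim: super(path, options) does the footer read and parse, and the subclass only derives the Hive ObjectInspector. Condensed, the class now has roughly this shape (illustrative, not the literal file):

  public class ReaderImpl extends org.apache.orc.impl.ReaderImpl
      implements Reader {
    private final ObjectInspector inspector;   // the only Hive-specific state

    public ReaderImpl(Path path,
                      OrcFile.ReaderOptions options) throws IOException {
      super(path, options);                    // parent parses the file tail
      this.inspector = OrcStruct.createObjectInspector(0, getTypes());
    }

    @Override
    public ObjectInspector getObjectInspector() {
      return inspector;
    }
  }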
/** Extracts the necessary metadata from an externally stored buffer (fullFooterBuffer). */
@@ -565,20 +270,6 @@ public class ReaderImpl implements Reader {
);
}
- private static OrcFile.WriterVersion extractWriterVersion(OrcProto.PostScript ps) {
- return (ps.hasWriterVersion()
- ? getWriterVersion(ps.getWriterVersion()) : OrcFile.WriterVersion.ORIGINAL);
- }
-
- private static List<StripeInformation> convertProtoStripesToStripes(
- List<OrcProto.StripeInformation> stripes) {
- List<StripeInformation> result = new ArrayList<StripeInformation>(stripes.size());
- for (OrcProto.StripeInformation info : stripes) {
- result.add(new StripeInformationImpl(info));
- }
- return result;
- }
-
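extractWriterVersion above and the getWriterVersion(int) lookup removed a few hunks up combine into one policy, restated here as a sketch: known ids map to their enum value, unknown (newer) ids degrade to FUTURE, and pre-versioning files report ORIGINAL.

  static OrcFile.WriterVersion writerVersion(OrcProto.PostScript ps) {
    if (ps.hasWriterVersion()) {
      int id = ps.getWriterVersion();
      for (OrcFile.WriterVersion v : OrcFile.WriterVersion.values()) {
        if (v.getId() == id) {
          return v;                            // known writer
        }
      }
      return OrcFile.WriterVersion.FUTURE;     // newer writer than this reader
    }
    return OrcFile.WriterVersion.ORIGINAL;     // file predates the field
  }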
/**
* MetaInfoObjExtractor - has logic to create the values for the fields in ReaderImpl
* from serialized fields.
@@ -617,7 +308,8 @@ public class ReaderImpl implements Reader {
public FileMetaInfo getFileMetaInfo() {
return new FileMetaInfo(compressionKind.toString(), bufferSize,
- metadataSize, footerByteBuffer, versionList, writerVersion, footerMetaAndPsBuffer);
+ getMetadataSize(), footerByteBuffer, getVersionList(),
+ getWriterVersion(), footerMetaAndPsBuffer);
}
/** Same as FileMetaInfo, but with extra fields. FileMetaInfo is serialized for splits
@@ -697,184 +389,7 @@ public class ReaderImpl implements Reader {
}
@Override
- public long getRawDataSize() {
- // if the deserializedSize is not computed, then compute it, else
- // return the already computed size. since we are reading from the footer
- // we don't have to compute deserialized size repeatedly
- if (deserializedSize == -1) {
- List<Integer> indices = Lists.newArrayList();
- for (int i = 0; i < fileStats.size(); ++i) {
- indices.add(i);
- }
- deserializedSize = getRawDataSizeFromColIndices(indices);
- }
- return deserializedSize;
- }
-
- @Override
- public long getRawDataSizeFromColIndices(List<Integer> colIndices) {
- return getRawDataSizeFromColIndices(colIndices, types, fileStats);
- }
-
- public static long getRawDataSizeFromColIndices(
- List<Integer> colIndices, List<OrcProto.Type> types,
- List<OrcProto.ColumnStatistics> stats) {
- long result = 0;
- for (int colIdx : colIndices) {
- result += getRawDataSizeOfColumn(colIdx, types, stats);
- }
- return result;
- }
-
- private static long getRawDataSizeOfColumn(int colIdx, List<OrcProto.Type> types,
- List<OrcProto.ColumnStatistics> stats) {
- OrcProto.ColumnStatistics colStat = stats.get(colIdx);
- long numVals = colStat.getNumberOfValues();
- OrcProto.Type type = types.get(colIdx);
-
- switch (type.getKind()) {
- case BINARY:
- // old orc format doesn't support binary statistics. checking for binary
- // statistics is not required as protocol buffers takes care of it.
- return colStat.getBinaryStatistics().getSum();
- case STRING:
- case CHAR:
- case VARCHAR:
- // old orc format doesn't support sum for string statistics. checking for
- // existence is not required as protocol buffers takes care of it.
-
- // ORC strings are deserialized to java strings. so use java data model's
- // string size
- numVals = numVals == 0 ? 1 : numVals;
- int avgStrLen = (int) (colStat.getStringStatistics().getSum() / numVals);
- return numVals * JavaDataModel.get().lengthForStringOfLength(avgStrLen);
- case TIMESTAMP:
- return numVals * JavaDataModel.get().lengthOfTimestamp();
- case DATE:
- return numVals * JavaDataModel.get().lengthOfDate();
- case DECIMAL:
- return numVals * JavaDataModel.get().lengthOfDecimal();
- case DOUBLE:
- case LONG:
- return numVals * JavaDataModel.get().primitive2();
- case FLOAT:
- case INT:
- case SHORT:
- case BOOLEAN:
- case BYTE:
- return numVals * JavaDataModel.get().primitive1();
- default:
- LOG.debug("Unknown primitive category: " + type.getKind());
- break;
- }
-
- return 0;
- }
-
- @Override
- public long getRawDataSizeOfColumns(List<String> colNames) {
- List<Integer> colIndices = getColumnIndicesFromNames(colNames);
- return getRawDataSizeFromColIndices(colIndices);
- }
-
- private List<Integer> getColumnIndicesFromNames(List<String> colNames) {
- // top level struct
- OrcProto.Type type = types.get(0);
- List<Integer> colIndices = Lists.newArrayList();
- List<String> fieldNames = type.getFieldNamesList();
- int fieldIdx = 0;
- for (String colName : colNames) {
- if (fieldNames.contains(colName)) {
- fieldIdx = fieldNames.indexOf(colName);
- } else {
- String s = "Cannot find field for: " + colName + " in ";
- for (String fn : fieldNames) {
- s += fn + ", ";
- }
- LOG.warn(s);
- continue;
- }
-
- // a single field may span multiple columns. find start and end column
- // index for the requested field
- int idxStart = type.getSubtypes(fieldIdx);
-
- int idxEnd;
-
-      // if the specified field is the last one, then the end index will be
-      // the last column index
- if (fieldIdx + 1 > fieldNames.size() - 1) {
- idxEnd = getLastIdx() + 1;
- } else {
- idxEnd = type.getSubtypes(fieldIdx + 1);
- }
-
- // if start index and end index are same then the field is a primitive
- // field else complex field (like map, list, struct, union)
- if (idxStart == idxEnd) {
- // simple field
- colIndices.add(idxStart);
- } else {
- // complex fields spans multiple columns
- for (int i = idxStart; i < idxEnd; i++) {
- colIndices.add(i);
- }
- }
- }
- return colIndices;
- }
-
- private int getLastIdx() {
- Set<Integer> indices = new HashSet<>();
- for (OrcProto.Type type : types) {
- indices.addAll(type.getSubtypesList());
- }
- return Collections.max(indices);
- }
-
- @Override
- public List<OrcProto.StripeStatistics> getOrcProtoStripeStatistics() {
- return stripeStats;
- }
-
- @Override
- public List<OrcProto.ColumnStatistics> getOrcProtoFileStatistics() {
- return fileStats;
- }
-
- @Override
- public List<StripeStatistics> getStripeStatistics() {
- List<StripeStatistics> result = new ArrayList<>();
- for (OrcProto.StripeStatistics ss : stripeStats) {
- result.add(new StripeStatistics(ss.getColStatsList()));
- }
- return result;
- }
-
- public List<OrcProto.UserMetadataItem> getOrcProtoUserMetadata() {
- return userMetadata;
- }
-
- @Override
- public List<Integer> getVersionList() {
- return versionList;
- }
-
- @Override
- public int getMetadataSize() {
- return metadataSize;
- }
-
- @Override
public String toString() {
- StringBuilder buffer = new StringBuilder();
- buffer.append("ORC Reader(");
- buffer.append(path);
- if (maxLength != -1) {
- buffer.append(", ");
- buffer.append(maxLength);
- }
- buffer.append(")");
- return buffer.toString();
+ return "Hive " + super.toString();
}
}
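The raw-data-size machinery in this final hunk moves down as well. Its least obvious case is the string family, where the deserialized size is estimated from column statistics rather than measured; a hedged restatement of that branch:

  // estimate: average string length from the stats, charged at the Java
  // object cost per value (numVals is clamped to avoid dividing by zero)
  static long stringColumnSize(OrcProto.ColumnStatistics colStat) {
    long numVals = Math.max(1, colStat.getNumberOfValues());
    int avgStrLen = (int) (colStat.getStringStatistics().getSum() / numVals);
    return numVals * JavaDataModel.get().lengthForStringOfLength(avgStrLen);
  }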