Posted to commits@hive.apache.org by br...@apache.org on 2014/08/24 05:43:57 UTC
svn commit: r1620103 [12/27] - in /hive/branches/spark: ./ accumulo-handler/
common/src/java/org/apache/hadoop/hive/ant/
common/src/java/org/apache/hadoop/hive/common/type/
common/src/test/org/apache/hadoop/hive/common/type/ data/files/
hcatalog/stream...
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/AcidInputFormat.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/AcidInputFormat.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/AcidInputFormat.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/AcidInputFormat.java Sun Aug 24 03:43:48 2014
@@ -22,7 +22,7 @@ import org.apache.hadoop.conf.Configurat
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.ValidTxnList;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.RecordReader;
@@ -86,11 +86,20 @@ import java.io.IOException;
* <p>
* To support transitions between non-ACID layouts to ACID layouts, the input
* formats are expected to support both layouts and detect the correct one.
- *
- * @param <V> The row type
+ * <p>
+ * A note on the KEY of this InputFormat.
+ * For row-at-a-time processing, KEY can conveniently pass RowId into the operator
+ * pipeline. For vectorized execution the KEY could perhaps represent a range in the batch.
+ * Since {@link org.apache.hadoop.hive.ql.io.orc.OrcInputFormat} is declared to return
+ * {@code NullWritable} key, {@link org.apache.hadoop.hive.ql.io.AcidRecordReader} is defined
+ * to provide access to the RowId. Other implementations of AcidInputFormat can use either
+ * mechanism.
+ * </p>
+ *
+ * @param <VALUE> The row type
*/
-public interface AcidInputFormat<V>
- extends InputFormat<NullWritable, V>, InputFormatChecker {
+public interface AcidInputFormat<KEY extends WritableComparable, VALUE>
+ extends InputFormat<KEY, VALUE>, InputFormatChecker {
/**
* Options for controlling the record readers.
@@ -140,7 +149,7 @@ public interface AcidInputFormat<V>
* @return a record reader
* @throws IOException
*/
- public RowReader<V> getReader(InputSplit split,
+ public RowReader<VALUE> getReader(InputSplit split,
Options options) throws IOException;
public static interface RawReader<V>
@@ -162,11 +171,18 @@ public interface AcidInputFormat<V>
* @return a record reader
* @throws IOException
*/
- RawReader<V> getRawReader(Configuration conf,
+ RawReader<VALUE> getRawReader(Configuration conf,
boolean collapseEvents,
int bucket,
ValidTxnList validTxnList,
Path baseDirectory,
Path[] deltaDirectory
) throws IOException;
+
+ /**
+ * A RecordReader returned by an AcidInputFormat working in row-at-a-time mode should implement AcidRecordReader.
+ */
+ public interface AcidRecordReader<K,V> extends RecordReader<K,V> {
+ RecordIdentifier getRecordIdentifier();
+ }
}
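
A rough consumer-side sketch of the AcidRecordReader contract introduced above (not part of this patch; the helper class is hypothetical, and OrcStruct is used only as an example value type):

    import java.io.IOException;
    import org.apache.hadoop.hive.ql.io.AcidInputFormat;
    import org.apache.hadoop.hive.ql.io.RecordIdentifier;
    import org.apache.hadoop.hive.ql.io.orc.OrcStruct;
    import org.apache.hadoop.io.NullWritable;

    // Hypothetical helper: drains a row-at-a-time reader and prints each row's ROW__ID,
    // which is available through getRecordIdentifier() even though the KEY is NullWritable.
    class RowIdDump {
      static void dumpRowIds(AcidInputFormat.AcidRecordReader<NullWritable, OrcStruct> reader)
          throws IOException {
        NullWritable key = reader.createKey();
        OrcStruct row = reader.createValue();
        while (reader.next(key, row)) {
          RecordIdentifier id = reader.getRecordIdentifier();
          System.out.println(id.getTransactionId() + "/" + id.getBucketId() + "/" + id.getRowId());
        }
      }
    }
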
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/AcidOutputFormat.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/AcidOutputFormat.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/AcidOutputFormat.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/AcidOutputFormat.java Sun Aug 24 03:43:48 2014
@@ -23,7 +23,7 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.Reporter;
import java.io.IOException;
@@ -34,7 +34,7 @@ import java.util.Properties;
* An extension for OutputFormats that want to implement ACID transactions.
* @param <V> the row type of the file
*/
-public interface AcidOutputFormat<V> extends HiveOutputFormat<NullWritable, V> {
+public interface AcidOutputFormat<K extends WritableComparable, V> extends HiveOutputFormat<K, V> {
/**
* Options to control how the files are written
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/HiveContextAwareRecordReader.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/HiveContextAwareRecordReader.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/HiveContextAwareRecordReader.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/HiveContextAwareRecordReader.java Sun Aug 24 03:43:48 2014
@@ -20,17 +20,13 @@ package org.apache.hadoop.hive.ql.io;
import java.io.IOException;
import java.util.ArrayList;
-import java.util.LinkedList;
import java.util.List;
import java.util.Map;
-import java.util.Properties;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.conf.HiveConf;
-import org.apache.hadoop.hive.common.ObjectPair;
import org.apache.hadoop.hive.io.HiveIOExceptionHandlerUtil;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.FooterBuffer;
@@ -42,16 +38,13 @@ import org.apache.hadoop.hive.ql.udf.gen
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPLessThan;
-import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
-import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
-import org.apache.hadoop.util.ReflectionUtils;
/** This class prepares an IOContext, and provides the ability to perform a binary search on the
* data. The binary search can be used by setting the value of inputFormatSorted in the
@@ -119,7 +112,18 @@ public abstract class HiveContextAwareRe
}
updateIOContext();
try {
- return doNext(key, value);
+ boolean retVal = doNext(key, value);
+ if(retVal) {
+ if(key instanceof RecordIdentifier) {
+ //supports AcidInputFormat implementations that use the KEY to pass ROW__ID info
+ ioCxtRef.ri = (RecordIdentifier)key;
+ }
+ else if(recordReader instanceof AcidInputFormat.AcidRecordReader) {
+ //supports AcidInputFormat implementations that do not use the KEY to pass ROW__ID info
+ ioCxtRef.ri = ((AcidInputFormat.AcidRecordReader) recordReader).getRecordIdentifier();
+ }
+ }
+ return retVal;
} catch (IOException e) {
ioCxtRef.setIOExceptions(true);
throw e;
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/IOContext.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/IOContext.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/IOContext.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/IOContext.java Sun Aug 24 03:43:48 2014
@@ -61,6 +61,10 @@ public class IOContext {
Comparison comparison = null;
// The class name of the generic UDF being used by the filter
String genericUDFClassName = null;
+ /**
+ * supports {@link org.apache.hadoop.hive.ql.metadata.VirtualColumn#ROWID}
+ */
+ public RecordIdentifier ri;
public static enum Comparison {
GREATER,
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/RecordIdentifier.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/RecordIdentifier.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/RecordIdentifier.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/RecordIdentifier.java Sun Aug 24 03:43:48 2014
@@ -19,16 +19,81 @@
package org.apache.hadoop.hive.ql.io;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
/**
- * Gives the Record identifer information for the current record.
+ * Gives the Record identifier information for the current record.
*/
public class RecordIdentifier implements WritableComparable<RecordIdentifier> {
+ /**
+ * This is in support of {@link org.apache.hadoop.hive.ql.metadata.VirtualColumn#ROWID}
+ * Contains metadata about each field in RecordIdentifier that needs to be part of ROWID
+ * which is represented as a struct {@link org.apache.hadoop.hive.ql.io.RecordIdentifier.StructInfo}.
+ * Each field of RecordIdentifier which should be part of ROWID should be in this enum... which
+ * really means that it should be part of VirtualColumn (so make a subclass for rowid).
+ */
+ public static enum Field {
+ //note the enum names match field names in the struct
+ transactionId(TypeInfoFactory.longTypeInfo,
+ PrimitiveObjectInspectorFactory.javaLongObjectInspector),
+ bucketId(TypeInfoFactory.intTypeInfo, PrimitiveObjectInspectorFactory.javaIntObjectInspector),
+ rowId(TypeInfoFactory.longTypeInfo, PrimitiveObjectInspectorFactory.javaLongObjectInspector);
+ public final TypeInfo fieldType;
+ public final ObjectInspector fieldOI;
+ Field(TypeInfo fieldType, ObjectInspector fieldOI) {
+ this.fieldType = fieldType;
+ this.fieldOI = fieldOI;
+ }
+ }
+ /**
+ * RecordIdentifier is passed along the operator tree as a struct. This class contains a few
+ * utilities for that.
+ */
+ public static final class StructInfo {
+ private static final List<String> fieldNames = new ArrayList<String>(Field.values().length);
+ private static final List<TypeInfo> fieldTypes = new ArrayList<TypeInfo>(fieldNames.size());
+ private static final List<ObjectInspector> fieldOis =
+ new ArrayList<ObjectInspector>(fieldNames.size());
+ static {
+ for(Field f : Field.values()) {
+ fieldNames.add(f.name());
+ fieldTypes.add(f.fieldType);
+ fieldOis.add(f.fieldOI);
+ }
+ }
+ public static final TypeInfo typeInfo =
+ TypeInfoFactory.getStructTypeInfo(fieldNames, fieldTypes);
+ public static final ObjectInspector oi =
+ ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOis);
+
+ /**
+ * Copies relevant fields from {@code ri} to {@code struct}
+ * @param ri
+ * @param struct must be of size Field.values().length
+ */
+ public static void toArray(RecordIdentifier ri, Object[] struct) {
+ assert struct != null && struct.length == Field.values().length;
+ if(ri == null) {
+ Arrays.fill(struct, null);
+ return;
+ }
+ struct[Field.transactionId.ordinal()] = ri.getTransactionId();
+ struct[Field.bucketId.ordinal()] = ri.getBucketId();
+ struct[Field.rowId.ordinal()] = ri.getRowId();
+ }
+ }
+
private long transactionId;
private int bucketId;
private long rowId;
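
A small sketch of the new StructInfo helper flattening a RecordIdentifier into the ROW__ID struct layout (not in this patch; the three-argument constructor and the concrete values are assumed for illustration):

    RecordIdentifier ri = new RecordIdentifier(100L, 2, 7L);   // transactionId, bucketId, rowId
    Object[] rowIdStruct = new Object[RecordIdentifier.Field.values().length];
    RecordIdentifier.StructInfo.toArray(ri, rowIdStruct);
    // rowIdStruct now holds {100L, 2, 7L} in the Field enum order
    // (transactionId, bucketId, rowId) and can be inspected through StructInfo.oi.
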
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/avro/AvroGenericRecordReader.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/avro/AvroGenericRecordReader.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/avro/AvroGenericRecordReader.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/avro/AvroGenericRecordReader.java Sun Aug 24 03:43:48 2014
@@ -127,7 +127,7 @@ public class AvroGenericRecordReader imp
String s = job.get(AvroSerdeUtils.AVRO_SERDE_SCHEMA);
if(s != null) {
LOG.info("Found the avro schema in the job: " + s);
- return Schema.parse(s);
+ return AvroSerdeUtils.getSchemaFor(s);
}
// No more places to get the schema from. Give up. May have to re-encode later.
return null;
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java Sun Aug 24 03:43:48 2014
@@ -98,7 +98,7 @@ import com.google.common.util.concurrent
*/
public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
InputFormatChecker, VectorizedInputFormatInterface,
- AcidInputFormat<OrcStruct> {
+ AcidInputFormat<NullWritable, OrcStruct> {
private static final Log LOG = LogFactory.getLog(OrcInputFormat.class);
static final HadoopShims SHIMS = ShimLoader.getHadoopShims();
@@ -989,7 +989,7 @@ public class OrcInputFormat implements
boolean vectorMode = Utilities.isVectorMode(conf);
// if HiveCombineInputFormat gives us FileSplits instead of OrcSplits,
- // we know it is not ACID.
+ // we know it is not ACID. (see the check in CombineHiveInputFormat.getSplits() that ensures this)
if (inputSplit.getClass() == FileSplit.class) {
if (vectorMode) {
return createVectorizedReader(inputSplit, conf, reporter);
@@ -998,62 +998,75 @@ public class OrcInputFormat implements
((FileSplit) inputSplit).getPath(),
OrcFile.readerOptions(conf)), conf, (FileSplit) inputSplit);
}
-
+
OrcSplit split = (OrcSplit) inputSplit;
reporter.setStatus(inputSplit.toString());
- // if we are strictly old-school, just use the old code
+ Options options = new Options(conf).reporter(reporter);
+ final RowReader<OrcStruct> inner = getReader(inputSplit, options);
+
+
+ /* Even though there are no delta files, we still need to produce row ids so that an
+ * UPDATE or DELETE statement will work on a table that has had no previous updates */
if (split.isOriginal() && split.getDeltas().isEmpty()) {
if (vectorMode) {
return createVectorizedReader(inputSplit, conf, reporter);
} else {
- return new OrcRecordReader(OrcFile.createReader(split.getPath(),
- OrcFile.readerOptions(conf)), conf, split);
+ return new NullKeyRecordReader(inner, conf);
}
}
- Options options = new Options(conf).reporter(reporter);
- final RowReader<OrcStruct> inner = getReader(inputSplit, options);
if (vectorMode) {
return (org.apache.hadoop.mapred.RecordReader)
new VectorizedOrcAcidRowReader(inner, conf, (FileSplit) inputSplit);
}
- final RecordIdentifier id = inner.createKey();
-
- // Return a RecordReader that is compatible with the Hive 0.12 reader
- // with NullWritable for the key instead of RecordIdentifier.
- return new org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct>(){
- @Override
- public boolean next(NullWritable nullWritable,
- OrcStruct orcStruct) throws IOException {
- return inner.next(id, orcStruct);
- }
+ return new NullKeyRecordReader(inner, conf);
+ }
+ /**
+ * Return a RecordReader that is compatible with the Hive 0.12 reader
+ * with NullWritable for the key instead of RecordIdentifier.
+ */
+ public static final class NullKeyRecordReader implements AcidRecordReader<NullWritable, OrcStruct> {
+ private final RecordIdentifier id;
+ private final RowReader<OrcStruct> inner;
+
+ public RecordIdentifier getRecordIdentifier() {
+ return id;
+ }
+ private NullKeyRecordReader(RowReader<OrcStruct> inner, Configuration conf) {
+ this.inner = inner;
+ id = inner.createKey();
+ }
+ @Override
+ public boolean next(NullWritable nullWritable,
+ OrcStruct orcStruct) throws IOException {
+ return inner.next(id, orcStruct);
+ }
- @Override
- public NullWritable createKey() {
- return NullWritable.get();
- }
+ @Override
+ public NullWritable createKey() {
+ return NullWritable.get();
+ }
- @Override
- public OrcStruct createValue() {
- return inner.createValue();
- }
+ @Override
+ public OrcStruct createValue() {
+ return inner.createValue();
+ }
- @Override
- public long getPos() throws IOException {
- return inner.getPos();
- }
+ @Override
+ public long getPos() throws IOException {
+ return inner.getPos();
+ }
- @Override
- public void close() throws IOException {
- inner.close();
- }
+ @Override
+ public void close() throws IOException {
+ inner.close();
+ }
- @Override
- public float getProgress() throws IOException {
- return inner.getProgress();
- }
- };
+ @Override
+ public float getProgress() throws IOException {
+ return inner.getProgress();
+ }
}
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java Sun Aug 24 03:43:48 2014
@@ -50,7 +50,7 @@ import java.util.Properties;
* A Hive OutputFormat for ORC files.
*/
public class OrcOutputFormat extends FileOutputFormat<NullWritable, OrcSerdeRow>
- implements AcidOutputFormat<OrcSerdeRow> {
+ implements AcidOutputFormat<NullWritable, OrcSerdeRow> {
private static class OrcRecordWriter
implements RecordWriter<NullWritable, OrcSerdeRow>,
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java Sun Aug 24 03:43:48 2014
@@ -88,6 +88,9 @@ public class OrcRecordUpdater implements
private final IntWritable bucket = new IntWritable();
private final LongWritable rowId = new LongWritable();
private long insertedRows = 0;
+ // This records how many rows have been inserted or deleted. It is separate from insertedRows
+ // because that is monotonically increasing to give new unique row ids.
+ private long rowCountDelta = 0;
private final KeyIndexBuilder indexBuilder = new KeyIndexBuilder();
static class AcidStats {
@@ -263,6 +266,7 @@ public class OrcRecordUpdater implements
}
addEvent(INSERT_OPERATION, currentTransaction, currentTransaction,
insertedRows++, row);
+ rowCountDelta++;
}
@Override
@@ -283,6 +287,7 @@ public class OrcRecordUpdater implements
}
addEvent(DELETE_OPERATION, currentTransaction, originalTransaction, rowId,
null);
+ rowCountDelta--;
}
@Override
@@ -317,7 +322,11 @@ public class OrcRecordUpdater implements
@Override
public SerDeStats getStats() {
- return null;
+ SerDeStats stats = new SerDeStats();
+ stats.setRowCount(rowCountDelta);
+ // Don't worry about setting raw data size diff. I have no idea how to calculate that
+ // without finding the row we are updating or deleting, which would be a mess.
+ return stats;
}
@VisibleForTesting
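
A toy model of the bookkeeping added above (plain Java, not Hive code): insertedRows only grows so that new rows keep getting unique ids, while rowCountDelta is the net change that getStats() now reports:

    long insertedRows = 0, rowCountDelta = 0;
    for (int i = 0; i < 5; i++) { insertedRows++; rowCountDelta++; }  // five insert() calls
    rowCountDelta--;                                                  // one delete() call
    // insertedRows == 5 (monotonic), rowCountDelta == 4: the value SerDeStats.getRowCount()
    // would report for this writer.
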
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java Sun Aug 24 03:43:48 2014
@@ -21,6 +21,7 @@ import static org.apache.hadoop.hive.con
import java.io.EOFException;
import java.io.IOException;
+import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.sql.Timestamp;
@@ -1292,8 +1293,9 @@ class RecordReaderImpl implements Record
BigInteger bInt = SerializationUtils.readBigInteger(valueStream);
result.vector[i].update(bInt, (short) scratchScaleVector.vector[i]);
- // Change the scale to match the schema if the scale in data is different.
- if (scale != scratchScaleVector.vector[i]) {
+ // Change the scale to match the schema only if the schema's scale is less than the scale in the data.
+ // (HIVE-7373) If the schema's scale is bigger, the value keeps its original trailing zeros.
+ if (scale < scratchScaleVector.vector[i]) {
result.vector[i].changeScaleDestructive((short) scale);
}
}
@@ -2410,6 +2412,9 @@ class RecordReaderImpl implements Record
private static Object getBaseObjectForComparison(Object predObj, Object statsObj) {
if (predObj != null) {
+ if (predObj instanceof ExprNodeConstantDesc) {
+ predObj = ((ExprNodeConstantDesc) predObj).getValue();
+ }
// following are implicitly convertible
if (statsObj instanceof Long) {
if (predObj instanceof Double) {
@@ -2428,10 +2433,6 @@ class RecordReaderImpl implements Record
return Double.valueOf(predObj.toString());
}
} else if (statsObj instanceof String) {
- // Ex: where d = date '1970-02-01' will be ExprNodeConstantDesc
- if (predObj instanceof ExprNodeConstantDesc) {
- return ((ExprNodeConstantDesc) predObj).getValue().toString();
- }
return predObj.toString();
} else if (statsObj instanceof HiveDecimal) {
if (predObj instanceof Long) {
@@ -2440,6 +2441,8 @@ class RecordReaderImpl implements Record
return HiveDecimal.create(predObj.toString());
} else if (predObj instanceof String) {
return HiveDecimal.create(predObj.toString());
+ } else if (predObj instanceof BigDecimal) {
+ return HiveDecimal.create((BigDecimal)predObj);
}
}
}
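
A hedged illustration of the ExprNodeConstantDesc unwrapping that now happens up front for every statistics type instead of only in the String branch (constructed literal, not patch code; assumes the single-argument ExprNodeConstantDesc constructor):

    // import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
    ExprNodeConstantDesc pred = new ExprNodeConstantDesc(java.sql.Date.valueOf("1970-02-01"));
    Object literal = pred.getValue();   // unwrapped before the statsObj type dispatch,
                                        // so decimal/long/double stats also see the raw literal
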
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerWriterV2.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerWriterV2.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerWriterV2.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RunLengthIntegerWriterV2.java Sun Aug 24 03:43:48 2014
@@ -142,9 +142,9 @@ class RunLengthIntegerWriterV2 implement
private final boolean signed;
private EncodingType encoding;
private int numLiterals;
- private long[] zigzagLiterals;
- private long[] baseRedLiterals;
- private long[] adjDeltas;
+ private final long[] zigzagLiterals = new long[MAX_SCOPE];
+ private final long[] baseRedLiterals = new long[MAX_SCOPE];
+ private final long[] adjDeltas = new long[MAX_SCOPE];
private long fixedDelta;
private int zzBits90p;
private int zzBits100p;
@@ -252,8 +252,11 @@ class RunLengthIntegerWriterV2 implement
// store the first value as delta value using zigzag encoding
utils.writeVslong(output, adjDeltas[0]);
- // adjacent delta values are bit packed
- utils.writeInts(adjDeltas, 1, adjDeltas.length - 1, fb, output);
+ // adjacent delta values are bit packed. The length of adjDeltas array is
+ // always one less than the number of literals (delta difference for n
+ // elements is n-1). We have already written one element, write the
+ // remaining numLiterals - 2 elements here
+ utils.writeInts(adjDeltas, 1, numLiterals - 2, fb, output);
}
}
@@ -323,7 +326,7 @@ class RunLengthIntegerWriterV2 implement
// base reduced literals are bit packed
int closestFixedBits = utils.getClosestFixedBits(fb);
- utils.writeInts(baseRedLiterals, 0, baseRedLiterals.length, closestFixedBits,
+ utils.writeInts(baseRedLiterals, 0, numLiterals, closestFixedBits,
output);
// write patch list
@@ -372,7 +375,7 @@ class RunLengthIntegerWriterV2 implement
output.write(headerSecondByte);
// bit packing the zigzag encoded literals
- utils.writeInts(zigzagLiterals, 0, zigzagLiterals.length, fb, output);
+ utils.writeInts(zigzagLiterals, 0, numLiterals, fb, output);
// reset run length
variableRunLength = 0;
@@ -414,14 +417,6 @@ class RunLengthIntegerWriterV2 implement
}
private void determineEncoding() {
- // used for direct encoding
- zigzagLiterals = new long[numLiterals];
-
- // used for patched base encoding
- baseRedLiterals = new long[numLiterals];
-
- // used for delta encoding
- adjDeltas = new long[numLiterals - 1];
int idx = 0;
@@ -530,10 +525,10 @@ class RunLengthIntegerWriterV2 implement
// is not significant then we can use direct or delta encoding
double p = 0.9;
- zzBits90p = utils.percentileBits(zigzagLiterals, p);
+ zzBits90p = utils.percentileBits(zigzagLiterals, 0, numLiterals, p);
p = 1.0;
- zzBits100p = utils.percentileBits(zigzagLiterals, p);
+ zzBits100p = utils.percentileBits(zigzagLiterals, 0, numLiterals, p);
int diffBitsLH = zzBits100p - zzBits90p;
@@ -543,18 +538,18 @@ class RunLengthIntegerWriterV2 implement
&& isFixedDelta == false) {
// patching is done only on base reduced values.
// remove base from literals
- for(int i = 0; i < zigzagLiterals.length; i++) {
+ for(int i = 0; i < numLiterals; i++) {
baseRedLiterals[i] = literals[i] - min;
}
// 95th percentile width is used to determine max allowed value
// after which patching will be done
p = 0.95;
- brBits95p = utils.percentileBits(baseRedLiterals, p);
+ brBits95p = utils.percentileBits(baseRedLiterals, 0, numLiterals, p);
// 100th percentile is used to compute the max patch width
p = 1.0;
- brBits100p = utils.percentileBits(baseRedLiterals, p);
+ brBits100p = utils.percentileBits(baseRedLiterals, 0, numLiterals, p);
// after base reducing the values, if the difference in bits between
// 95th percentile and 100th percentile value is zero then there
@@ -592,7 +587,7 @@ class RunLengthIntegerWriterV2 implement
// since we are considering only 95 percentile, the size of gap and
// patch array can contain only be 5% values
- patchLength = (int) Math.ceil((baseRedLiterals.length * 0.05));
+ patchLength = (int) Math.ceil((numLiterals * 0.05));
int[] gapList = new int[patchLength];
long[] patchList = new long[patchLength];
@@ -616,7 +611,7 @@ class RunLengthIntegerWriterV2 implement
int gap = 0;
int maxGap = 0;
- for(int i = 0; i < baseRedLiterals.length; i++) {
+ for(int i = 0; i < numLiterals; i++) {
// if value is above mask then create the patch and record the gap
if (baseRedLiterals[i] > mask) {
gap = i - prev;
@@ -694,9 +689,6 @@ class RunLengthIntegerWriterV2 implement
numLiterals = 0;
encoding = null;
prevDelta = 0;
- zigzagLiterals = null;
- baseRedLiterals = null;
- adjDeltas = null;
fixedDelta = 0;
zzBits90p = 0;
zzBits100p = 0;
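
A toy example of why the write and percentile calls above now pass numLiterals rather than the array length once the scratch buffers are preallocated (plain Java, not the writer code):

    long[] scratch = new long[512];            // stands in for zigzagLiterals sized to MAX_SCOPE
    int numLiterals = 3;
    scratch[0] = 7; scratch[1] = 8; scratch[2] = 9;
    // Encoding scratch.length values would also emit 509 stale entries left over from
    // earlier runs; only the first numLiterals values belong to the current run.
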
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/orc/SerializationUtils.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/orc/SerializationUtils.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/orc/SerializationUtils.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/orc/SerializationUtils.java Sun Aug 24 03:43:48 2014
@@ -244,7 +244,7 @@ final class SerializationUtils {
* @param p - percentile value (>=0.0 to <=1.0)
* @return pth percentile bits
*/
- int percentileBits(long[] data, double p) {
+ int percentileBits(long[] data, int offset, int length, double p) {
if ((p > 1.0) || (p <= 0.0)) {
return -1;
}
@@ -254,13 +254,12 @@ final class SerializationUtils {
int[] hist = new int[32];
// compute the histogram
- for(long l : data) {
- int idx = encodeBitWidth(findClosestNumBits(l));
+ for(int i = offset; i < (offset + length); i++) {
+ int idx = encodeBitWidth(findClosestNumBits(data[i]));
hist[idx] += 1;
}
- int len = data.length;
- int perLen = (int) (len * (1.0 - p));
+ int perLen = (int) (length * (1.0 - p));
// return the bits required by pth percentile length
for(int i = hist.length - 1; i >= 0; i--) {
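
A standalone sketch of the percentile-bits idea over a sub-range, matching the new offset/length signature (simplified: exact bit widths instead of Hive's encoded widths, and no argument validation):

    // Returns the number of bits needed by the pth percentile value of data[offset..offset+length).
    static int percentileBits(long[] data, int offset, int length, double p) {
      int[] hist = new int[65];                        // histogram of bit widths 0..64
      for (int i = offset; i < offset + length; i++) {
        hist[64 - Long.numberOfLeadingZeros(data[i])]++;
      }
      int allowed = (int) (length * (1.0 - p));        // how many values may exceed the answer
      for (int i = hist.length - 1; i >= 0; i--) {
        allowed -= hist[i];
        if (allowed < 0) {
          return i;
        }
      }
      return 0;
    }
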
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ProjectionPusher.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ProjectionPusher.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ProjectionPusher.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ProjectionPusher.java Sun Aug 24 03:43:48 2014
@@ -146,7 +146,7 @@ public class ProjectionPusher {
if ((part != null) && (part.getTableDesc() != null)) {
Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), cloneJobConf);
}
- pushProjectionsAndFilters(cloneJobConf, path.toString(), path.toUri().toString());
+ pushProjectionsAndFilters(cloneJobConf, path.toString(), path.toUri().getPath());
return cloneJobConf;
}
}
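
For reference, the difference the one-line change above targets (the location below is made up): toUri().toString() keeps the scheme and authority, while toUri().getPath() yields only the scheme-less path:

    org.apache.hadoop.fs.Path p =
        new org.apache.hadoop.fs.Path("hdfs://nn:8020/warehouse/tbl/part=1");
    String full = p.toUri().toString();   // "hdfs://nn:8020/warehouse/tbl/part=1"
    String bare = p.toUri().getPath();    // "/warehouse/tbl/part=1"
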
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ArrayWritableGroupConverter.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ArrayWritableGroupConverter.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ArrayWritableGroupConverter.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ArrayWritableGroupConverter.java Sun Aug 24 03:43:48 2014
@@ -13,6 +13,9 @@
*/
package org.apache.hadoop.hive.ql.io.parquet.convert;
+import java.util.List;
+
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.Writable;
@@ -30,7 +33,7 @@ public class ArrayWritableGroupConverter
private Writable[] mapPairContainer;
public ArrayWritableGroupConverter(final GroupType groupType, final HiveGroupConverter parent,
- final int index) {
+ final int index, List<TypeInfo> hiveSchemaTypeInfos) {
this.parent = parent;
this.index = index;
int count = groupType.getFieldCount();
@@ -40,7 +43,8 @@ public class ArrayWritableGroupConverter
isMap = count == 2;
converters = new Converter[count];
for (int i = 0; i < count; i++) {
- converters[i] = getConverterFromDescription(groupType.getType(i), i, this);
+ converters[i] = getConverterFromDescription(groupType.getType(i), i, this,
+ hiveSchemaTypeInfos);
}
}
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableGroupConverter.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableGroupConverter.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableGroupConverter.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableGroupConverter.java Sun Aug 24 03:43:48 2014
@@ -16,6 +16,7 @@ package org.apache.hadoop.hive.ql.io.par
import java.util.ArrayList;
import java.util.List;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.Writable;
@@ -36,19 +37,21 @@ public class DataWritableGroupConverter
private final Object[] currentArr;
private Writable[] rootMap;
- public DataWritableGroupConverter(final GroupType requestedSchema, final GroupType tableSchema) {
- this(requestedSchema, null, 0, tableSchema);
+ public DataWritableGroupConverter(final GroupType requestedSchema, final GroupType tableSchema,
+ final List<TypeInfo> hiveSchemaTypeInfos) {
+ this(requestedSchema, null, 0, tableSchema, hiveSchemaTypeInfos);
final int fieldCount = tableSchema.getFieldCount();
this.rootMap = new Writable[fieldCount];
}
public DataWritableGroupConverter(final GroupType groupType, final HiveGroupConverter parent,
- final int index) {
- this(groupType, parent, index, groupType);
+ final int index, final List<TypeInfo> hiveSchemaTypeInfos) {
+ this(groupType, parent, index, groupType, hiveSchemaTypeInfos);
}
public DataWritableGroupConverter(final GroupType selectedGroupType,
- final HiveGroupConverter parent, final int index, final GroupType containingGroupType) {
+ final HiveGroupConverter parent, final int index, final GroupType containingGroupType,
+ final List<TypeInfo> hiveSchemaTypeInfos) {
this.parent = parent;
this.index = index;
final int totalFieldCount = containingGroupType.getFieldCount();
@@ -62,7 +65,8 @@ public class DataWritableGroupConverter
Type subtype = selectedFields.get(i);
if (containingGroupType.getFields().contains(subtype)) {
converters[i] = getConverterFromDescription(subtype,
- containingGroupType.getFieldIndex(subtype.getName()), this);
+ containingGroupType.getFieldIndex(subtype.getName()), this,
+ hiveSchemaTypeInfos);
} else {
throw new IllegalStateException("Group type [" + containingGroupType +
"] does not contain requested field: " + subtype);
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableRecordConverter.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableRecordConverter.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableRecordConverter.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/DataWritableRecordConverter.java Sun Aug 24 03:43:48 2014
@@ -13,6 +13,9 @@
*/
package org.apache.hadoop.hive.ql.io.parquet.convert;
+import java.util.List;
+
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.io.ArrayWritable;
import parquet.io.api.GroupConverter;
@@ -28,8 +31,10 @@ public class DataWritableRecordConverter
private final DataWritableGroupConverter root;
- public DataWritableRecordConverter(final GroupType requestedSchema, final GroupType tableSchema) {
- this.root = new DataWritableGroupConverter(requestedSchema, tableSchema);
+ public DataWritableRecordConverter(final GroupType requestedSchema, final GroupType tableSchema,
+ final List<TypeInfo> hiveColumnTypeInfos) {
+ this.root = new DataWritableGroupConverter(requestedSchema, tableSchema,
+ hiveColumnTypeInfos);
}
@Override
@@ -41,4 +46,4 @@ public class DataWritableRecordConverter
public GroupConverter getRootConverter() {
return root;
}
-}
+}
\ No newline at end of file
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java Sun Aug 24 03:43:48 2014
@@ -16,12 +16,19 @@ package org.apache.hadoop.hive.ql.io.par
import java.math.BigDecimal;
import java.sql.Timestamp;
import java.util.ArrayList;
+import java.util.List;
+import org.apache.hadoop.hive.common.type.HiveChar;
+import org.apache.hadoop.hive.common.type.HiveVarchar;
import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTime;
import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTimeUtils;
+import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.hive.serde2.io.HiveCharWritable;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable;
import org.apache.hadoop.hive.serde2.io.TimestampWritable;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.FloatWritable;
@@ -145,6 +152,32 @@ public enum ETypeConverter {
}
};
}
+ },
+ ECHAR_CONVERTER(HiveCharWritable.class) {
+ @Override
+ Converter getConverter(final PrimitiveType type, final int index, final HiveGroupConverter parent) {
+ return new BinaryConverter<HiveCharWritable>(type, parent, index) {
+ @Override
+ protected HiveCharWritable convert(Binary binary) {
+ HiveChar hiveChar = new HiveChar();
+ hiveChar.setValue(binary.toStringUsingUTF8());
+ return new HiveCharWritable(hiveChar);
+ }
+ };
+ }
+ },
+ EVARCHAR_CONVERTER(HiveVarcharWritable.class) {
+ @Override
+ Converter getConverter(final PrimitiveType type, final int index, final HiveGroupConverter parent) {
+ return new BinaryConverter<HiveVarcharWritable>(type, parent, index) {
+ @Override
+ protected HiveVarcharWritable convert(Binary binary) {
+ HiveVarchar hiveVarchar = new HiveVarchar();
+ hiveVarchar.setValue(binary.toStringUsingUTF8());
+ return new HiveVarcharWritable(hiveVarchar);
+ }
+ };
+ }
};
final Class<?> _type;
@@ -159,7 +192,8 @@ public enum ETypeConverter {
abstract Converter getConverter(final PrimitiveType type, final int index, final HiveGroupConverter parent);
- public static Converter getNewConverter(final PrimitiveType type, final int index, final HiveGroupConverter parent) {
+ public static Converter getNewConverter(final PrimitiveType type, final int index,
+ final HiveGroupConverter parent, List<TypeInfo> hiveSchemaTypeInfos) {
if (type.isPrimitive() && (type.asPrimitiveType().getPrimitiveTypeName().equals(PrimitiveType.PrimitiveTypeName.INT96))) {
//TODO- cleanup once parquet support Timestamp type annotation.
return ETypeConverter.ETIMESTAMP_CONVERTER.getConverter(type, index, parent);
@@ -167,7 +201,15 @@ public enum ETypeConverter {
if (OriginalType.DECIMAL == type.getOriginalType()) {
return EDECIMAL_CONVERTER.getConverter(type, index, parent);
} else if (OriginalType.UTF8 == type.getOriginalType()) {
- return ESTRING_CONVERTER.getConverter(type, index, parent);
+ if (hiveSchemaTypeInfos.get(index).getTypeName()
+ .startsWith(serdeConstants.CHAR_TYPE_NAME)) {
+ return ECHAR_CONVERTER.getConverter(type, index, parent);
+ } else if (hiveSchemaTypeInfos.get(index).getTypeName()
+ .startsWith(serdeConstants.VARCHAR_TYPE_NAME)) {
+ return EVARCHAR_CONVERTER.getConverter(type, index, parent);
+ } else if (type.isPrimitive()) {
+ return ESTRING_CONVERTER.getConverter(type, index, parent);
+ }
}
Class<?> javaType = type.getPrimitiveTypeName().javaType;
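
A small sketch of the dispatch the new hiveSchemaTypeInfos argument enables (not patch code): parameterized Hive type names still start with the base type-name constants, so a startsWith check is enough to route to the char/varchar converters:

    TypeInfo ti = TypeInfoUtils.getTypeInfoFromTypeString("varchar(10)");
    String name = ti.getTypeName();                                         // "varchar(10)"
    boolean isVarchar = name.startsWith(serdeConstants.VARCHAR_TYPE_NAME);  // true  -> EVARCHAR_CONVERTER
    boolean isChar    = name.startsWith(serdeConstants.CHAR_TYPE_NAME);     // false
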
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveGroupConverter.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveGroupConverter.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveGroupConverter.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveGroupConverter.java Sun Aug 24 03:43:48 2014
@@ -13,6 +13,9 @@
*/
package org.apache.hadoop.hive.ql.io.parquet.convert;
+import java.util.List;
+
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.io.Writable;
import parquet.io.api.Converter;
@@ -23,17 +26,20 @@ import parquet.schema.Type.Repetition;
public abstract class HiveGroupConverter extends GroupConverter {
protected static Converter getConverterFromDescription(final Type type, final int index,
- final HiveGroupConverter parent) {
+ final HiveGroupConverter parent, List<TypeInfo> hiveSchemaTypeInfos) {
if (type == null) {
return null;
}
if (type.isPrimitive()) {
- return ETypeConverter.getNewConverter(type.asPrimitiveType(), index, parent);
+ return ETypeConverter.getNewConverter(type.asPrimitiveType(), index, parent,
+ hiveSchemaTypeInfos);
} else {
if (type.asGroupType().getRepetition() == Repetition.REPEATED) {
- return new ArrayWritableGroupConverter(type.asGroupType(), parent, index);
+ return new ArrayWritableGroupConverter(type.asGroupType(), parent, index,
+ hiveSchemaTypeInfos);
} else {
- return new DataWritableGroupConverter(type.asGroupType(), parent, index);
+ return new DataWritableGroupConverter(type.asGroupType(), parent, index,
+ hiveSchemaTypeInfos);
}
}
}
@@ -42,4 +48,4 @@ public abstract class HiveGroupConverter
protected abstract void add(int index, Writable value);
-}
+}
\ No newline at end of file
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/HiveSchemaConverter.java Sun Aug 24 03:43:48 2014
@@ -16,6 +16,7 @@ package org.apache.hadoop.hive.ql.io.par
import java.util.List;
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
+import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
@@ -25,7 +26,6 @@ import org.apache.hadoop.hive.serde2.typ
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import parquet.schema.ConversionPatterns;
-import parquet.schema.DecimalMetadata;
import parquet.schema.GroupType;
import parquet.schema.MessageType;
import parquet.schema.OriginalType;
@@ -81,6 +81,14 @@ public class HiveSchemaConverter {
return new PrimitiveType(repetition, PrimitiveTypeName.INT96, name);
} else if (typeInfo.equals(TypeInfoFactory.voidTypeInfo)) {
throw new UnsupportedOperationException("Void type not implemented");
+ } else if (typeInfo.getTypeName().toLowerCase().startsWith(
+ serdeConstants.CHAR_TYPE_NAME)) {
+ return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8)
+ .named(name);
+ } else if (typeInfo.getTypeName().toLowerCase().startsWith(
+ serdeConstants.VARCHAR_TYPE_NAME)) {
+ return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8)
+ .named(name);
} else if (typeInfo instanceof DecimalTypeInfo) {
DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo) typeInfo;
int prec = decimalTypeInfo.precision();
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java Sun Aug 24 03:43:48 2014
@@ -14,6 +14,7 @@
package org.apache.hadoop.hive.ql.io.parquet.read;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -23,6 +24,8 @@ import org.apache.hadoop.hive.ql.io.IOCo
import org.apache.hadoop.hive.ql.io.parquet.convert.DataWritableRecordConverter;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.util.StringUtils;
@@ -60,6 +63,28 @@ public class DataWritableReadSupport ext
return (List<String>) VirtualColumn.
removeVirtualColumns(StringUtils.getStringCollection(columns));
}
+
+ private static List<TypeInfo> getColumnTypes(Configuration configuration) {
+
+ List<String> columnNames;
+ String columnNamesProperty = configuration.get(IOConstants.COLUMNS);
+ if (columnNamesProperty.length() == 0) {
+ columnNames = new ArrayList<String>();
+ } else {
+ columnNames = Arrays.asList(columnNamesProperty.split(","));
+ }
+ List<TypeInfo> columnTypes;
+ String columnTypesProperty = configuration.get(IOConstants.COLUMNS_TYPES);
+ if (columnTypesProperty.length() == 0) {
+ columnTypes = new ArrayList<TypeInfo>();
+ } else {
+ columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypesProperty);
+ }
+
+ columnTypes = VirtualColumn.removeVirtualColumnTypes(columnNames, columnTypes);
+ return columnTypes;
+ }
+
/**
*
* It creates the readContext for Parquet side with the requested schema during the init phase.
@@ -100,20 +125,22 @@ public class DataWritableReadSupport ext
final List<Type> typeListWanted = new ArrayList<Type>();
final boolean indexAccess = configuration.getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false);
for (final Integer idx : indexColumnsWanted) {
- String col = listColumns.get(idx);
- if (indexAccess) {
- typeListWanted.add(tableSchema.getType(col));
- } else {
- col = col.toLowerCase();
- if (lowerCaseFileSchemaColumns.containsKey(col)) {
- typeListWanted.add(tableSchema.getType(lowerCaseFileSchemaColumns.get(col)));
+ if (idx < listColumns.size()) {
+ String col = listColumns.get(idx);
+ if (indexAccess) {
+ typeListWanted.add(tableSchema.getType(col));
} else {
- // should never occur?
- String msg = "Column " + col + " at index " + idx + " does not exist in " +
+ col = col.toLowerCase();
+ if (lowerCaseFileSchemaColumns.containsKey(col)) {
+ typeListWanted.add(tableSchema.getType(lowerCaseFileSchemaColumns.get(col)));
+ } else {
+ // should never occur?
+ String msg = "Column " + col + " at index " + idx + " does not exist in " +
lowerCaseFileSchemaColumns;
- throw new IllegalStateException(msg);
+ throw new IllegalStateException(msg);
+ }
}
- }
+ }
}
requestedSchemaByUser = resolveSchemaAccess(new MessageType(fileSchema.getName(),
typeListWanted), fileSchema, configuration);
@@ -146,7 +173,8 @@ public class DataWritableReadSupport ext
}
final MessageType tableSchema = resolveSchemaAccess(MessageTypeParser.
parseMessageType(metadata.get(HIVE_SCHEMA_KEY)), fileSchema, configuration);
- return new DataWritableRecordConverter(readContext.getRequestedSchema(), tableSchema);
+ return new DataWritableRecordConverter(readContext.getRequestedSchema(), tableSchema,
+ getColumnTypes(configuration));
}
/**
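
A sketch of what the new getColumnTypes() helper parses out of the job configuration (made-up column values):

    // configuration.get(IOConstants.COLUMNS)       -> "id,name,ts"
    // configuration.get(IOConstants.COLUMNS_TYPES) -> "int,varchar(64),timestamp"
    List<String> names = Arrays.asList("id,name,ts".split(","));
    List<TypeInfo> types = TypeInfoUtils.getTypeInfosFromTypeString("int,varchar(64),timestamp");
    // types -> [int, varchar(64), timestamp]; ROW__ID and other virtual columns are then
    // stripped via VirtualColumn.removeVirtualColumnTypes(names, types) before the
    // record converter is built.
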
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ArrayWritableObjectInspector.java Sun Aug 24 03:43:48 2014
@@ -25,12 +25,14 @@ import org.apache.hadoop.hive.serde2.obj
import org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
import org.apache.hadoop.io.ArrayWritable;
/**
@@ -102,12 +104,10 @@ public class ArrayWritableObjectInspecto
return PrimitiveObjectInspectorFactory.writableTimestampObjectInspector;
} else if (typeInfo.equals(TypeInfoFactory.dateTypeInfo)) {
throw new UnsupportedOperationException("Parquet does not support date. See HIVE-6384");
- } else if (typeInfo.getTypeName().toLowerCase().startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
- throw new UnsupportedOperationException("Parquet does not support decimal. See HIVE-6384");
} else if (typeInfo.getTypeName().toLowerCase().startsWith(serdeConstants.CHAR_TYPE_NAME)) {
- throw new UnsupportedOperationException("Parquet does not support char. See HIVE-6384");
+ return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector((CharTypeInfo) typeInfo);
} else if (typeInfo.getTypeName().toLowerCase().startsWith(serdeConstants.VARCHAR_TYPE_NAME)) {
- throw new UnsupportedOperationException("Parquet does not support varchar. See HIVE-6384");
+ return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector((VarcharTypeInfo) typeInfo);
} else {
throw new UnsupportedOperationException("Unknown field type: " + typeInfo);
}
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java Sun Aug 24 03:43:48 2014
@@ -42,6 +42,8 @@ import org.apache.hadoop.hive.serde2.obj
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveCharObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveVarcharObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector;
@@ -60,6 +62,7 @@ import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
+import parquet.io.api.Binary;
/**
*
@@ -280,6 +283,12 @@ public class ParquetHiveSerDe extends Ab
return new BytesWritable(tgt);
case TIMESTAMP:
return new TimestampWritable(((TimestampObjectInspector) inspector).getPrimitiveJavaObject(obj));
+ case CHAR:
+ String strippedValue = ((HiveCharObjectInspector) inspector).getPrimitiveJavaObject(obj).getStrippedValue();
+ return new BytesWritable(Binary.fromString(strippedValue).getBytes());
+ case VARCHAR:
+ String value = ((HiveVarcharObjectInspector) inspector).getPrimitiveJavaObject(obj).getValue();
+ return new BytesWritable(Binary.fromString(value).getBytes());
default:
throw new SerDeException("Unknown primitive : " + inspector.getPrimitiveCategory());
}
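
A brief sketch of why the CHAR branch above writes getStrippedValue() while VARCHAR writes getValue() (assumed padding behavior, not patch code):

    HiveChar c = new HiveChar("ab", 5);
    String padded   = c.getValue();          // "ab   " - padded to the declared length
    String stripped = c.getStrippedValue();  // "ab"    - trailing pad spaces removed; written to Parquet
    HiveVarchar v = new HiveVarchar("ab", 5);
    String exact = v.getValue();             // "ab"    - varchar never pads
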
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java Sun Aug 24 03:43:48 2014
@@ -18,14 +18,6 @@
package org.apache.hadoop.hive.ql.io.sarg;
-import java.util.ArrayDeque;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Deque;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;
@@ -57,6 +49,15 @@ import org.apache.hadoop.hive.serde2.obj
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import java.math.BigDecimal;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Deque;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
/**
* The implementation of SearchArguments.
*/
@@ -947,7 +948,8 @@ final class SearchArgumentImpl implement
literal instanceof Long ||
literal instanceof Double ||
literal instanceof DateWritable ||
- literal instanceof HiveDecimal) {
+ literal instanceof HiveDecimal ||
+ literal instanceof BigDecimal) {
return literal;
} else if (literal instanceof HiveChar ||
literal instanceof HiveVarchar) {
@@ -979,7 +981,8 @@ final class SearchArgumentImpl implement
return PredicateLeaf.Type.FLOAT;
} else if (literal instanceof DateWritable) {
return PredicateLeaf.Type.DATE;
- } else if (literal instanceof HiveDecimal) {
+ } else if (literal instanceof HiveDecimal ||
+ literal instanceof BigDecimal) {
return PredicateLeaf.Type.DECIMAL;
}
throw new IllegalArgumentException("Unknown type for literal " + literal);
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/lockmgr/DbTxnManager.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/lockmgr/DbTxnManager.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/lockmgr/DbTxnManager.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/lockmgr/DbTxnManager.java Sun Aug 24 03:43:48 2014
@@ -165,13 +165,13 @@ public class DbTxnManager extends HiveTx
break;
case TABLE:
+ case DUMMYPARTITION: // in case of dynamic partitioning lock the table
t = output.getTable();
compBuilder.setDbName(t.getDbName());
compBuilder.setTableName(t.getTableName());
break;
case PARTITION:
- case DUMMYPARTITION:
compBuilder.setPartitionName(output.getPartition().getName());
t = output.getPartition().getTable();
compBuilder.setDbName(t.getDbName());
@@ -301,7 +301,10 @@ public class DbTxnManager extends HiveTx
try {
if (txnId > 0) rollbackTxn();
if (lockMgr != null) lockMgr.close();
+ if (client != null) client.close();
} catch (Exception e) {
+ LOG.error("Caught exception " + e.getClass().getName() + " with message <" + e.getMessage()
+ + ">, swallowing as there is nothing we can do with it.");
// Not much we can do about it here.
}
}
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java Sun Aug 24 03:43:48 2014
@@ -89,6 +89,7 @@ import org.apache.hadoop.hive.metastore.
import org.apache.hadoop.hive.metastore.api.Role;
import org.apache.hadoop.hive.metastore.api.RolePrincipalGrant;
import org.apache.hadoop.hive.metastore.api.SerDeInfo;
+import org.apache.hadoop.hive.metastore.api.SetPartitionsStatsRequest;
import org.apache.hadoop.hive.metastore.api.ShowCompactResponse;
import org.apache.hadoop.hive.metastore.api.SkewedInfo;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
@@ -2553,6 +2554,15 @@ private void constructOneLBLocationMap(F
throw new HiveException(e);
}
}
+
+ public boolean setPartitionColumnStatistics(SetPartitionsStatsRequest request) throws HiveException {
+ try {
+ return getMSC().setPartitionColumnStatistics(request);
+ } catch (Exception e) {
+ LOG.debug(StringUtils.stringifyException(e));
+ throw new HiveException(e);
+ }
+ }
public List<ColumnStatisticsObj> getTableColumnStatistics(
String dbName, String tableName, List<String> colNames) throws HiveException {
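The new Hive.setPartitionColumnStatistics method is a thin wrapper over the metastore client call. A hedged sketch of how a caller might use it; the per-partition ColumnStatistics objects are assumed to be populated elsewhere, and the helper method name is illustrative.

import java.util.List;

import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
import org.apache.hadoop.hive.metastore.api.SetPartitionsStatsRequest;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.HiveException;

public class SetPartitionStatsExample {
  // statsPerPartition is assumed to hold one populated ColumnStatistics
  // object per partition being updated.
  static boolean publish(List<ColumnStatistics> statsPerPartition) throws HiveException {
    SetPartitionsStatsRequest request = new SetPartitionsStatsRequest(statsPerPartition);
    // Delegates to the metastore client and rethrows any failure as
    // HiveException, mirroring the wrapper added in the hunk above.
    return Hive.get().setPartitionColumnStatistics(request);
  }
}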
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/metadata/Partition.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/metadata/Partition.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/metadata/Partition.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/metadata/Partition.java Sun Aug 24 03:43:48 2014
@@ -233,6 +233,10 @@ public class Partition implements Serial
return ret;
}
+ public Path getPartitionPath() {
+ return getDataLocation();
+ }
+
public Path getDataLocation() {
if (table.isPartitioned()) {
return new Path(tPartition.getSd().getLocation());
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/metadata/SessionHiveMetaStoreClient.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/metadata/SessionHiveMetaStoreClient.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/metadata/SessionHiveMetaStoreClient.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/metadata/SessionHiveMetaStoreClient.java Sun Aug 24 03:43:48 2014
@@ -249,6 +249,8 @@ public class SessionHiveMetaStoreClient
+ " is not a directory or unable to create one");
}
}
+ // Make sure location string is in proper format
+ tbl.getSd().setLocation(tblPath.toString());
}
// Add temp table info to current session
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/metadata/VirtualColumn.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/metadata/VirtualColumn.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/metadata/VirtualColumn.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/metadata/VirtualColumn.java Sun Aug 24 03:43:48 2014
@@ -22,25 +22,36 @@ import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
+import java.util.ListIterator;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Iterables;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.common.classification.InterfaceAudience;
import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.io.RecordIdentifier;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+@InterfaceAudience.Private
public class VirtualColumn implements Serializable {
private static final long serialVersionUID = 1L;
- public static VirtualColumn FILENAME = new VirtualColumn("INPUT__FILE__NAME", (PrimitiveTypeInfo)TypeInfoFactory.stringTypeInfo);
- public static VirtualColumn BLOCKOFFSET = new VirtualColumn("BLOCK__OFFSET__INSIDE__FILE", (PrimitiveTypeInfo)TypeInfoFactory.longTypeInfo);
- public static VirtualColumn ROWOFFSET = new VirtualColumn("ROW__OFFSET__INSIDE__BLOCK", (PrimitiveTypeInfo)TypeInfoFactory.longTypeInfo);
+ public static final VirtualColumn FILENAME = new VirtualColumn("INPUT__FILE__NAME", (PrimitiveTypeInfo)TypeInfoFactory.stringTypeInfo);
+ public static final VirtualColumn BLOCKOFFSET = new VirtualColumn("BLOCK__OFFSET__INSIDE__FILE", (PrimitiveTypeInfo)TypeInfoFactory.longTypeInfo);
+ public static final VirtualColumn ROWOFFSET = new VirtualColumn("ROW__OFFSET__INSIDE__BLOCK", (PrimitiveTypeInfo)TypeInfoFactory.longTypeInfo);
- public static VirtualColumn RAWDATASIZE = new VirtualColumn("RAW__DATA__SIZE", (PrimitiveTypeInfo)TypeInfoFactory.longTypeInfo);
+ public static final VirtualColumn RAWDATASIZE = new VirtualColumn("RAW__DATA__SIZE", (PrimitiveTypeInfo)TypeInfoFactory.longTypeInfo);
+ /**
+ * {@link org.apache.hadoop.hive.ql.io.RecordIdentifier}
+ */
+ public static final VirtualColumn ROWID = new VirtualColumn("ROW__ID", RecordIdentifier.StructInfo.typeInfo, true, RecordIdentifier.StructInfo.oi);
/**
* GROUPINGID is used with GROUP BY GROUPINGS SETS, ROLLUP and CUBE.
@@ -49,27 +60,28 @@ public class VirtualColumn implements Se
* set if that column has been aggregated in that row. Otherwise the
* value is "0". Returns the decimal representation of the bit vector.
*/
- public static VirtualColumn GROUPINGID =
+ public static final VirtualColumn GROUPINGID =
new VirtualColumn("GROUPING__ID", (PrimitiveTypeInfo) TypeInfoFactory.intTypeInfo);
- public static VirtualColumn[] VIRTUAL_COLUMNS =
- new VirtualColumn[] {FILENAME, BLOCKOFFSET, ROWOFFSET, RAWDATASIZE, GROUPINGID};
-
- private String name;
- private PrimitiveTypeInfo typeInfo;
- private boolean isHidden = true;
+ public static ImmutableSet<String> VIRTUAL_COLUMN_NAMES =
+ ImmutableSet.of(FILENAME.getName(), BLOCKOFFSET.getName(), ROWOFFSET.getName(),
+ RAWDATASIZE.getName(), GROUPINGID.getName(), ROWID.getName());
- public VirtualColumn() {
- }
+ private final String name;
+ private final TypeInfo typeInfo;
+ private final boolean isHidden;
+ private final ObjectInspector oi;
- public VirtualColumn(String name, PrimitiveTypeInfo typeInfo) {
- this(name, typeInfo, true);
+ private VirtualColumn(String name, PrimitiveTypeInfo typeInfo) {
+ this(name, typeInfo, true,
+ PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(typeInfo));
}
- VirtualColumn(String name, PrimitiveTypeInfo typeInfo, boolean isHidden) {
+ private VirtualColumn(String name, TypeInfo typeInfo, boolean isHidden, ObjectInspector oi) {
this.name = name;
this.typeInfo = typeInfo;
this.isHidden = isHidden;
+ this.oi = oi;
}
public static List<VirtualColumn> getStatsRegistry(Configuration conf) {
@@ -87,26 +99,19 @@ public class VirtualColumn implements Se
if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVEROWOFFSET)) {
l.add(ROWOFFSET);
}
+ l.add(ROWID);
return l;
}
- public PrimitiveTypeInfo getTypeInfo() {
+ public TypeInfo getTypeInfo() {
return typeInfo;
}
- public void setTypeInfo(PrimitiveTypeInfo typeInfo) {
- this.typeInfo = typeInfo;
- }
-
public String getName() {
return this.name;
}
- public void setName(String name) {
- this.name = name;
- }
-
public boolean isHidden() {
return isHidden;
}
@@ -115,37 +120,58 @@ public class VirtualColumn implements Se
return isHidden;
}
- public void setIsHidden(boolean isHidden) {
- this.isHidden = isHidden;
+ public ObjectInspector getObjectInspector() {
+ return oi;
}
@Override
public boolean equals(Object o) {
- if (o == null) {
- return false;
- }
if (this == o) {
return true;
}
+ if(!(o instanceof VirtualColumn)) {
+ return false;
+ }
VirtualColumn c = (VirtualColumn) o;
return this.name.equals(c.name)
&& this.typeInfo.getTypeName().equals(c.getTypeInfo().getTypeName());
}
-
+ @Override
+ public int hashCode() {
+ int c = 19;
+ c = 31 * name.hashCode() + c;
+ return 31 * typeInfo.getTypeName().hashCode() + c;
+ }
public static Collection<String> removeVirtualColumns(final Collection<String> columns) {
- for(VirtualColumn vcol : VIRTUAL_COLUMNS) {
- columns.remove(vcol.getName());
- }
+ Iterables.removeAll(columns, VIRTUAL_COLUMN_NAMES);
return columns;
}
+ public static List<TypeInfo> removeVirtualColumnTypes(final List<String> columnNames,
+ final List<TypeInfo> columnTypes) {
+ if (columnNames.size() != columnTypes.size()) {
+ throw new IllegalArgumentException("Number of column names in configuration " +
+ columnNames.size() + " differs from column types " + columnTypes.size());
+ }
+
+ int i = 0;
+ ListIterator<TypeInfo> it = columnTypes.listIterator();
+ while(it.hasNext()) {
+ it.next();
+ if (VIRTUAL_COLUMN_NAMES.contains(columnNames.get(i))) {
+ it.remove();
+ }
+ ++i;
+ }
+ return columnTypes;
+ }
+
public static StructObjectInspector getVCSObjectInspector(List<VirtualColumn> vcs) {
List<String> names = new ArrayList<String>(vcs.size());
List<ObjectInspector> inspectors = new ArrayList<ObjectInspector>(vcs.size());
for (VirtualColumn vc : vcs) {
names.add(vc.getName());
- inspectors.add(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(
- vc.getTypeInfo()));
+ inspectors.add(vc.oi);
}
return ObjectInspectorFactory.getStandardStructObjectInspector(names, inspectors);
}
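The refactor above replaces the mutable VIRTUAL_COLUMNS array with an immutable name set and adds removeVirtualColumnTypes, which drops the TypeInfo entries sitting at the same positions as virtual column names in a parallel name list. A small sketch of that behavior with made-up schema data; note VirtualColumn is now marked @InterfaceAudience.Private, so this is internal API.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

public class VirtualColumnFilterExample {
  public static void main(String[] args) {
    // Hypothetical schema: one real column plus the ROW__ID virtual column.
    List<String> names = Arrays.asList("id", VirtualColumn.ROWID.getName());
    List<TypeInfo> types = new ArrayList<TypeInfo>(Arrays.asList(
        TypeInfoFactory.intTypeInfo,
        VirtualColumn.ROWID.getTypeInfo()));

    // Removes the TypeInfo entries whose positions correspond to virtual
    // column names; here only intTypeInfo should remain.
    List<TypeInfo> pruned = VirtualColumn.removeVirtualColumnTypes(names, types);
    System.out.println(pruned);
  }
}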
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java Sun Aug 24 03:43:48 2014
@@ -927,11 +927,9 @@ public class Vectorizer implements Physi
if (desc instanceof ExprNodeColumnDesc) {
ExprNodeColumnDesc c = (ExprNodeColumnDesc) desc;
// Currently, we do not support vectorized virtual columns (see HIVE-5570).
- for (VirtualColumn vc : VirtualColumn.VIRTUAL_COLUMNS) {
- if (c.getColumn().equals(vc.getName())) {
- LOG.info("Cannot vectorize virtual column " + c.getColumn());
- return false;
- }
+ if (VirtualColumn.VIRTUAL_COLUMN_NAMES.contains(c.getColumn())) {
+ LOG.info("Cannot vectorize virtual column " + c.getColumn());
+ return false;
}
}
String typeName = desc.getTypeInfo().getTypeName();
@@ -1076,10 +1074,8 @@ public class Vectorizer implements Physi
// Not using method column.getIsVirtualCol() because partitioning columns are also
// treated as virtual columns in ColumnInfo.
- for (VirtualColumn vc : VirtualColumn.VIRTUAL_COLUMNS) {
- if (column.getInternalName().equals(vc.getName())) {
+ if (VirtualColumn.VIRTUAL_COLUMN_NAMES.contains(column.getInternalName())) {
return true;
- }
}
return false;
}
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java Sun Aug 24 03:43:48 2014
@@ -20,6 +20,7 @@ package org.apache.hadoop.hive.ql.optimi
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
+
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.conf.HiveConf;
@@ -67,8 +68,10 @@ import org.apache.hadoop.hive.ql.udf.gen
import org.apache.hadoop.hive.serde.serdeConstants;
import java.util.Collections;
+import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import java.util.Stack;
public class StatsRulesProcFactory {
@@ -803,12 +806,13 @@ public class StatsRulesProcFactory {
// statistics object that is combination of statistics from all
// relations involved in JOIN
Statistics stats = new Statistics();
- List<Long> rowCountParents = Lists.newArrayList();
+ Map<String, Long> rowCountParents = new HashMap<String, Long>();
List<Long> distinctVals = Lists.newArrayList();
// 2 relations, multiple attributes
boolean multiAttr = false;
int numAttr = 1;
+ int numParent = parents.size();
Map<String, ColStatistics> joinedColStats = Maps.newHashMap();
Map<Integer, List<String>> joinKeys = Maps.newHashMap();
@@ -818,9 +822,20 @@ public class StatsRulesProcFactory {
ReduceSinkOperator parent = (ReduceSinkOperator) jop.getParentOperators().get(pos);
Statistics parentStats = parent.getStatistics();
- rowCountParents.add(parentStats.getNumRows());
List<ExprNodeDesc> keyExprs = parent.getConf().getKeyCols();
+ // Parent RS may have column statistics from multiple parents.
+ // Populate table alias to row count map, this will be used later to
+ // scale down/up column statistics based on new row count
+ // NOTE: JOIN with UNION as parent of RS will not have table alias
+ // propagated properly. UNION operator does not propagate the table
+ // alias of subqueries properly to expression nodes. Hence union20.q
+ // will have wrong number of rows.
+ Set<String> tableAliases = StatsUtils.getAllTableAlias(parent.getColumnExprMap());
+ for (String tabAlias : tableAliases) {
+ rowCountParents.put(tabAlias, parentStats.getNumRows());
+ }
+
// multi-attribute join key
if (keyExprs.size() > 1) {
multiAttr = true;
@@ -860,12 +875,19 @@ public class StatsRulesProcFactory {
perAttrDVs.add(cs.getCountDistint());
}
}
+
distinctVals.add(getDenominator(perAttrDVs));
perAttrDVs.clear();
}
- for (Long l : distinctVals) {
- denom *= l;
+ if (numAttr > numParent) {
+ // To avoid denominator getting larger and aggressively reducing
+ // number of rows, we will ease out denominator.
+ denom = getEasedOutDenominator(distinctVals);
+ } else {
+ for (Long l : distinctVals) {
+ denom *= l;
+ }
}
} else {
for (List<String> jkeys : joinKeys.values()) {
@@ -890,6 +912,7 @@ public class StatsRulesProcFactory {
Map<String, ExprNodeDesc> colExprMap = jop.getColumnExprMap();
RowSchema rs = jop.getSchema();
List<ColStatistics> outColStats = Lists.newArrayList();
+ Map<String, String> outInTabAlias = new HashMap<String, String>();
for (ColumnInfo ci : rs.getSignature()) {
String key = ci.getInternalName();
ExprNodeDesc end = colExprMap.get(key);
@@ -901,6 +924,7 @@ public class StatsRulesProcFactory {
ColStatistics cs = joinedColStats.get(fqColName);
String outColName = key;
String outTabAlias = ci.getTabAlias();
+ outInTabAlias.put(outTabAlias, tabAlias);
if (cs != null) {
cs.setColumnName(outColName);
cs.setTableAlias(outTabAlias);
@@ -911,7 +935,8 @@ public class StatsRulesProcFactory {
// update join statistics
stats.setColumnStats(outColStats);
- long newRowCount = computeNewRowCount(rowCountParents, denom);
+ long newRowCount = computeNewRowCount(
+ Lists.newArrayList(rowCountParents.values()), denom);
if (newRowCount <= 0 && LOG.isDebugEnabled()) {
newRowCount = 0;
@@ -920,7 +945,8 @@ public class StatsRulesProcFactory {
+ " #Rows of parents: " + rowCountParents.toString() + ". Denominator: " + denom);
}
- updateStatsForJoinType(stats, newRowCount, true, jop.getConf());
+ updateStatsForJoinType(stats, newRowCount, jop.getConf(),
+ rowCountParents, outInTabAlias);
jop.setStatistics(stats);
if (LOG.isDebugEnabled()) {
@@ -966,37 +992,54 @@ public class StatsRulesProcFactory {
return null;
}
+ private Long getEasedOutDenominator(List<Long> distinctVals) {
+ // Exponential back-off for NDVs.
+ // 1) Descending order sort of NDVs
+ // 2) denominator = NDV1 * (NDV2 ^ (1/2)) * (NDV3 ^ (1/4)) * ....
+ Collections.sort(distinctVals, Collections.reverseOrder());
+
+ long denom = distinctVals.get(0);
+ for (int i = 1; i < distinctVals.size(); i++) {
+ denom = (long) (denom * Math.pow(distinctVals.get(i), 1.0 / (1 << i)));
+ }
+
+ return denom;
+ }
+
private void updateStatsForJoinType(Statistics stats, long newNumRows,
- boolean useColStats, JoinDesc conf) {
- long oldRowCount = stats.getNumRows();
- double ratio = (double) newNumRows / (double) oldRowCount;
+ JoinDesc conf, Map<String, Long> rowCountParents,
+ Map<String, String> outInTabAlias) {
stats.setNumRows(newNumRows);
- if (useColStats) {
- List<ColStatistics> colStats = stats.getColumnStats();
- for (ColStatistics cs : colStats) {
- long oldDV = cs.getCountDistint();
- long newDV = oldDV;
-
- // if ratio is greater than 1, then number of rows increases. This can happen
- // when some operators like GROUPBY duplicates the input rows in which case
- // number of distincts should not change. Update the distinct count only when
- // the output number of rows is less than input number of rows.
- if (ratio <= 1.0) {
- newDV = (long) Math.ceil(ratio * oldDV);
- }
- // Assumes inner join
- // TODO: HIVE-5579 will handle different join types
- cs.setNumNulls(0);
- cs.setCountDistint(newDV);
- }
- stats.setColumnStats(colStats);
- long newDataSize = StatsUtils.getDataSizeFromColumnStats(newNumRows, colStats);
- stats.setDataSize(newDataSize);
- } else {
- long newDataSize = (long) (ratio * stats.getDataSize());
- stats.setDataSize(newDataSize);
+ // scale down/up the column statistics based on the changes in number of
+ // rows from each parent. For ex: If there are 2 parents for JOIN operator
+ // with 1st parent having 200 rows and 2nd parent having 2000 rows. Now if
+ // the new number of rows after applying join rule is 10, then the column
+ // stats for columns from 1st parent should be scaled down by 200/10 = 20x
+ // and stats for columns from 2nd parent should be scaled down by 200x
+ List<ColStatistics> colStats = stats.getColumnStats();
+ for (ColStatistics cs : colStats) {
+ long oldRowCount = rowCountParents.get(outInTabAlias.get(cs.getTableAlias()));
+ double ratio = (double) newNumRows / (double) oldRowCount;
+ long oldDV = cs.getCountDistint();
+ long newDV = oldDV;
+
+ // if ratio is greater than 1, then number of rows increases. This can happen
+ // when some operators like GROUPBY duplicates the input rows in which case
+ // number of distincts should not change. Update the distinct count only when
+ // the output number of rows is less than input number of rows.
+ if (ratio <= 1.0) {
+ newDV = (long) Math.ceil(ratio * oldDV);
+ }
+ // Assumes inner join
+ // TODO: HIVE-5579 will handle different join types
+ cs.setNumNulls(0);
+ cs.setCountDistint(newDV);
}
+ stats.setColumnStats(colStats);
+ long newDataSize = StatsUtils
+ .getDataSizeFromColumnStats(newNumRows, colStats);
+ stats.setDataSize(newDataSize);
}
private long computeNewRowCount(List<Long> rowCountParents, long denom) {
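The eased-out denominator above sorts the per-key NDVs in descending order and damps each successive value by an exponential back-off, which keeps the join row-count estimate from collapsing when there are more join keys than parents. A standalone sketch reproducing that arithmetic on sample NDVs (1000, 100 and 16 are made-up values), so the effect is easy to check by hand.

import java.util.Arrays;
import java.util.Collections;
import java.util.List;

public class EasedDenominatorExample {
  public static void main(String[] args) {
    // Sample per-key distinct-value counts.
    List<Long> distinctVals = Arrays.asList(100L, 1000L, 16L);

    // Same arithmetic as getEasedOutDenominator above:
    // sort descending, then denom = ndv1 * ndv2^(1/2) * ndv3^(1/4) * ...
    Collections.sort(distinctVals, Collections.reverseOrder());
    long denom = distinctVals.get(0);
    for (int i = 1; i < distinctVals.size(); i++) {
      denom = (long) (denom * Math.pow(distinctVals.get(i), 1.0 / (1 << i)));
    }
    // 1000 * sqrt(100) * 16^(1/4) = 1000 * 10 * 2 = 20000,
    // versus 1,600,000 from the plain product of all three NDVs.
    System.out.println(denom);
  }
}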
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/parse/FromClauseParser.g
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/parse/FromClauseParser.g?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/parse/FromClauseParser.g (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/parse/FromClauseParser.g Sun Aug 24 03:43:48 2014
@@ -144,7 +144,7 @@ fromSource
@init { gParent.pushMsg("from source", state); }
@after { gParent.popMsg(state); }
:
- ((Identifier LPAREN)=> partitionedTableFunction | tableSource | subQuerySource) (lateralView^)*
+ ((Identifier LPAREN)=> partitionedTableFunction | tableSource | subQuerySource | virtualTableSource) (lateralView^)*
;
tableBucketSample
@@ -256,3 +256,46 @@ searchCondition
;
//-----------------------------------------------------------------------------------
+
+//-------- Row Constructor ----------------------------------------------------------
+//in support of SELECT * FROM (VALUES(1,2,3),(4,5,6),...) as FOO(a,b,c) and
+// INSERT INTO <table> (col1,col2,...) VALUES(...),(...),...
+// INSERT INTO <table> (col1,col2,...) SELECT * FROM (VALUES(1,2,3),(4,5,6),...) as Foo(a,b,c)
+valueRowConstructor
+ :
+ LPAREN atomExpression (COMMA atomExpression)* RPAREN -> ^(TOK_VALUE_ROW atomExpression+)
+ ;
+
+valuesTableConstructor
+ :
+ valueRowConstructor (COMMA valueRowConstructor)* -> ^(TOK_VALUES_TABLE valueRowConstructor+)
+ ;
+
+/*
+VALUES(1),(2) means 2 rows, 1 column each.
+VALUES(1,2),(3,4) means 2 rows, 2 columns each.
+VALUES(1,2,3) means 1 row, 3 columns
+*/
+valuesClause
+ :
+ KW_VALUES valuesTableConstructor -> valuesTableConstructor
+ ;
+
+/*
+This represents a clause like this:
+(VALUES(1,2),(2,3)) as VirtTable(col1,col2)
+*/
+virtualTableSource
+ :
+ LPAREN valuesClause RPAREN tableNameColList -> ^(TOK_VIRTUAL_TABLE tableNameColList valuesClause)
+ ;
+/*
+e.g. as VirtTable(col1,col2)
+Note that we only want literals as column names
+*/
+tableNameColList
+ :
+ KW_AS? identifier LPAREN identifier (COMMA identifier)* RPAREN -> ^(TOK_VIRTUAL_TABREF ^(TOK_TABNAME identifier) ^(TOK_COL_NAME identifier+))
+ ;
+
+//-----------------------------------------------------------------------------------
\ No newline at end of file
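The new grammar rules admit VALUES row constructors both as a virtual table in the FROM clause and as the source of an INSERT. A hedged sketch that feeds such statements through ParseDriver to inspect the resulting AST; the table and column names are made up, and whether these exact statements parse depends on the rest of this commit (lexer keyword, semantic analysis).

import org.apache.hadoop.hive.ql.parse.ASTNode;
import org.apache.hadoop.hive.ql.parse.ParseDriver;
import org.apache.hadoop.hive.ql.parse.ParseException;

public class ValuesClauseParseExample {
  public static void main(String[] args) throws ParseException {
    ParseDriver pd = new ParseDriver();

    // VALUES as a virtual table in the FROM clause (TOK_VIRTUAL_TABLE subtree).
    ASTNode fromValues = pd.parse(
        "SELECT * FROM (VALUES (1, 'a'), (2, 'b')) AS vt(id, name)");

    // VALUES as the source of an INSERT.
    ASTNode insertValues = pd.parse(
        "INSERT INTO TABLE dest VALUES (1, 'a'), (2, 'b')");

    System.out.println(fromValues.dump());
    System.out.println(insertValues.dump());
  }
}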
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/parse/FunctionSemanticAnalyzer.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/parse/FunctionSemanticAnalyzer.java?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/parse/FunctionSemanticAnalyzer.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/parse/FunctionSemanticAnalyzer.java Sun Aug 24 03:43:48 2014
@@ -169,6 +169,7 @@ public class FunctionSemanticAnalyzer ex
try {
String[] qualifiedNameParts = FunctionUtils.getQualifiedFunctionNameParts(functionName);
String dbName = qualifiedNameParts[0];
+ functionName = qualifiedNameParts[1];
database = getDatabase(dbName);
} catch (HiveException e) {
LOG.error(e);
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveLexer.g
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveLexer.g?rev=1620103&r1=1620102&r2=1620103&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveLexer.g (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveLexer.g Sun Aug 24 03:43:48 2014
@@ -292,6 +292,7 @@ KW_TRANSACTIONS: 'TRANSACTIONS';
KW_REWRITE : 'REWRITE';
KW_AUTHORIZATION: 'AUTHORIZATION';
KW_CONF: 'CONF';
+KW_VALUES: 'VALUES';
// Operators
// NOTE: if you add a new function/operator, add it to sysFuncNames so that describe function _FUNC_ will work.