You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by na...@apache.org on 2010/11/04 22:33:36 UTC
svn commit: r1031264 - in /hive/trunk: ./
ql/src/java/org/apache/hadoop/hive/ql/exec/
serde/src/java/org/apache/hadoop/hive/serde2/columnar/
serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/
Author: namit
Date: Thu Nov 4 21:33:35 2010
New Revision: 1031264
URL: http://svn.apache.org/viewvc?rev=1031264&view=rev
Log:
HIVE-1751. Optimize ColumnarStructObjectInspector.getStructFieldData()
(Siying Dong via namit)
Modified:
hive/trunk/CHANGES.txt
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeColumnEvaluator.java
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarStruct.java
hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ColumnarStructObjectInspector.java
Modified: hive/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hive/trunk/CHANGES.txt?rev=1031264&r1=1031263&r2=1031264&view=diff
==============================================================================
--- hive/trunk/CHANGES.txt (original)
+++ hive/trunk/CHANGES.txt Thu Nov 4 21:33:35 2010
@@ -217,6 +217,9 @@ Trunk - Unreleased
HIVE-1761. Support show locks for a particular table
(namit via He Yongqiang)
+ HIVE-1751. Optimize ColumnarStructObjectInspector.getStructFieldData()
+ (Siying Dong via namit)
+
OPTIMIZATIONS
BUG FIXES
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeColumnEvaluator.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeColumnEvaluator.java?rev=1031264&r1=1031263&r2=1031264&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeColumnEvaluator.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeColumnEvaluator.java Thu Nov 4 21:33:35 2010
@@ -33,6 +33,9 @@ public class ExprNodeColumnEvaluator ext
protected ExprNodeColumnDesc expr;
+ transient boolean simpleCase;
+ transient StructObjectInspector inspector;
+ transient StructField field;
transient StructObjectInspector[] inspectors;
transient StructField[] fields;
transient boolean[] unionField;
@@ -47,40 +50,53 @@ public class ExprNodeColumnEvaluator ext
// We need to support field names like KEY.0, VALUE.1 between
// map-reduce boundary.
String[] names = expr.getColumn().split("\\.");
- inspectors = new StructObjectInspector[names.length];
- fields = new StructField[names.length];
- unionField = new boolean[names.length];
- int unionIndex = -1;
-
- for (int i = 0; i < names.length; i++) {
- if (i == 0) {
- inspectors[0] = (StructObjectInspector) rowInspector;
- } else {
- if (unionIndex != -1) {
- inspectors[i] = (StructObjectInspector) (
- (UnionObjectInspector)fields[i-1].getFieldObjectInspector()).
- getObjectInspectors().get(unionIndex);
+ String[] unionfields = names[0].split("\\:");
+ if (names.length == 1 && unionfields.length == 1) {
+ simpleCase = true;
+ inspector = (StructObjectInspector) rowInspector;
+ field = inspector.getStructFieldRef(names[0]);
+ return field.getFieldObjectInspector();
+ }
+ else {
+ simpleCase = false;
+ inspectors = new StructObjectInspector[names.length];
+ fields = new StructField[names.length];
+ unionField = new boolean[names.length];
+ int unionIndex = -1;
+
+ for (int i = 0; i < names.length; i++) {
+ if (i == 0) {
+ inspectors[0] = (StructObjectInspector) rowInspector;
} else {
- inspectors[i] = (StructObjectInspector) fields[i - 1]
- .getFieldObjectInspector();
+ if (unionIndex != -1) {
+ inspectors[i] = (StructObjectInspector) (
+ (UnionObjectInspector)fields[i-1].getFieldObjectInspector()).
+ getObjectInspectors().get(unionIndex);
+ } else {
+ inspectors[i] = (StructObjectInspector) fields[i - 1]
+ .getFieldObjectInspector();
+ }
+ }
+ // to support names like _colx:1._coly
+ unionfields = names[i].split("\\:");
+ fields[i] = inspectors[i].getStructFieldRef(unionfields[0]);
+ if (unionfields.length > 1) {
+ unionIndex = Integer.parseInt(unionfields[1]);
+ unionField[i] = true;
+ } else {
+ unionIndex = -1;
+ unionField[i] = false;
}
}
- // to support names like _colx:1._coly
- String[] unionfields = names[i].split("\\:");
- fields[i] = inspectors[i].getStructFieldRef(unionfields[0]);
- if (unionfields.length > 1) {
- unionIndex = Integer.parseInt(unionfields[1]);
- unionField[i] = true;
- } else {
- unionIndex = -1;
- unionField[i] = false;
- }
+ return fields[names.length - 1].getFieldObjectInspector();
}
- return fields[names.length - 1].getFieldObjectInspector();
}
@Override
public Object evaluate(Object row) throws HiveException {
+ if (simpleCase) {
+ return inspector.getStructFieldData(row, field);
+ }
Object o = row;
for (int i = 0; i < fields.length; i++) {
o = inspectors[i].getStructFieldData(o, fields[i]);
Modified: hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java
URL: http://svn.apache.org/viewvc/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java?rev=1031264&r1=1031263&r2=1031264&view=diff
==============================================================================
--- hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java (original)
+++ hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java Thu Nov 4 21:33:35 2010
@@ -94,7 +94,7 @@ public class ColumnarSerDe implements Se
java.util.ArrayList<Integer> notSkipIDs = ColumnProjectionUtils.getReadColumnIDs(job);
- cachedLazyStruct = new ColumnarStruct(cachedObjectInspector, notSkipIDs);
+ cachedLazyStruct = new ColumnarStruct(cachedObjectInspector, notSkipIDs, serdeParams.getNullSequence());
int size = serdeParams.getColumnTypes().size();
field = new BytesRefWritable[size];
Modified: hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarStruct.java
URL: http://svn.apache.org/viewvc/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarStruct.java?rev=1031264&r1=1031263&r2=1031264&view=diff
==============================================================================
--- hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarStruct.java (original)
+++ hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarStruct.java Thu Nov 4 21:33:35 2010
@@ -38,49 +38,49 @@ import org.apache.hadoop.io.Text;
* Object get parsed at its initialize time when call
* {@link #init(BytesRefArrayWritable cols)}, while LazyStruct parse fields in a
* lazy way.
- *
+ *
*/
public class ColumnarStruct {
- /**
- * The fields of the struct.
- */
- LazyObject[] fields;
-
private static final Log LOG = LogFactory.getLog(ColumnarStruct.class);
int[] prjColIDs = null; // list of projected column IDs
+ Text nullSequence;
+ int lengthNullSequence;
+
/**
* Construct a ColumnarStruct object with the TypeInfo. It creates the first
* level object at the first place
- *
+ *
* @param oi
* the ObjectInspector representing the type of this LazyStruct.
*/
public ColumnarStruct(ObjectInspector oi) {
- this(oi, null);
+ this(oi, null, null);
}
/**
* Construct a ColumnarStruct object with the TypeInfo. It creates the first
* level object at the first place
- *
+ *
* @param oi
* the ObjectInspector representing the type of this LazyStruct.
* @param notSkippedColumnIDs
* the column ids that should not be skipped
*/
public ColumnarStruct(ObjectInspector oi,
- ArrayList<Integer> notSkippedColumnIDs) {
+ ArrayList<Integer> notSkippedColumnIDs, Text nullSequence) {
List<? extends StructField> fieldRefs = ((StructObjectInspector) oi)
.getAllStructFieldRefs();
int num = fieldRefs.size();
- fields = new LazyObject[num];
- cachedByteArrayRef = new ByteArrayRef[num];
- rawBytesField = new BytesRefWritable[num];
- fieldSkipped = new boolean[num];
- inited = new boolean[num];
+
+ fieldInfoList = new FieldInfo[num];
+
+ if (nullSequence != null) {
+ this.nullSequence = nullSequence;
+ this.lengthNullSequence = nullSequence.getLength();
+ }
// if no columns is set to be skipped, add all columns in
// 'notSkippedColumnIDs'
@@ -91,15 +91,10 @@ public class ColumnarStruct {
}
for (int i = 0; i < num; i++) {
- fields[i] = LazyFactory.createLazyObject(fieldRefs.get(i)
- .getFieldObjectInspector());
- cachedByteArrayRef[i] = new ByteArrayRef();
- if (!notSkippedColumnIDs.contains(i)) {
- fieldSkipped[i] = true;
- inited[i] = true;
- } else {
- inited[i] = false;
- }
+ fieldInfoList[i] = new FieldInfo(
+ LazyFactory.createLazyObject(fieldRefs.get(i)
+ .getFieldObjectInspector()),
+ !notSkippedColumnIDs.contains(i));
}
// maintain a list of non-NULL column IDs
@@ -117,73 +112,110 @@ public class ColumnarStruct {
/**
* Get one field out of the struct.
- *
+ *
* If the field is a primitive field, return the actual object. Otherwise
* return the LazyObject. This is because PrimitiveObjectInspector does not
* have control over the object used by the user - the user simply directly
* use the Object instead of going through Object
* PrimitiveObjectInspector.get(Object).
- *
+ *
* NOTE: separator and nullSequence has to be the same each time this method
* is called. These two parameters are used only once to parse each record.
- *
+ *
* @param fieldID
* The field ID
* @param nullSequence
* The sequence for null value
* @return The field as a LazyObject
*/
- public Object getField(int fieldID, Text nullSequence) {
- return uncheckedGetField(fieldID, nullSequence);
+ public Object getField(int fieldID) {
+ return fieldInfoList[fieldID].uncheckedGetField();
}
- /*
- * use an array instead of only one object in case in future hive does not do
- * the byte copy.
- */
- ByteArrayRef[] cachedByteArrayRef = null;
- BytesRefWritable[] rawBytesField = null;
- boolean[] inited = null;
- boolean[] fieldSkipped = null;
-
- /**
- * Get the field out of the row without checking parsed. This is called by
- * both getField and getFieldsAsList.
- *
- * @param fieldID
- * The id of the field starting from 0.
- * @param nullSequence
- * The sequence representing NULL value.
- * @return The value of the field
- */
- protected Object uncheckedGetField(int fieldID, Text nullSequence) {
- if (fieldSkipped[fieldID]) {
- return null;
- }
- if (!inited[fieldID]) {
- BytesRefWritable passedInField = rawBytesField[fieldID];
- try {
- cachedByteArrayRef[fieldID].setData(passedInField.getData());
- } catch (IOException e) {
- throw new RuntimeException(e);
+ class FieldInfo {
+ LazyObject field;
+ /*
+ * keep one ByteArrayRef per field instead of a single shared object, in case
+ * in future hive does not do the byte copy.
+ */
+ ByteArrayRef cachedByteArrayRef;
+ BytesRefWritable rawBytesField;
+ boolean inited;
+ boolean fieldSkipped;
+
+ public FieldInfo(LazyObject lazyObject, boolean fieldSkipped) {
+ field = lazyObject;
+ cachedByteArrayRef = new ByteArrayRef();
+ if (fieldSkipped) {
+ this.fieldSkipped = true;
+ inited = true;
+ } else {
+ inited = false;
}
- fields[fieldID].init(cachedByteArrayRef[fieldID], passedInField
- .getStart(), passedInField.getLength());
- inited[fieldID] = true;
}
- byte[] data = cachedByteArrayRef[fieldID].getData();
- int fieldLen = rawBytesField[fieldID].length;
+ /*
+ * ============================ [PERF] ===================================
+ * This function is called for every row, once per projected column. It only
+ * records the raw bytes of the column and marks the field as not yet parsed;
+ * the actual parsing is deferred to uncheckedGetField(). A null col means the
+ * selected column does not exist in the file, so the field is marked skipped.
+ * =========================================================================
+ */
+ public void init(BytesRefWritable col) {
+ if (col != null) {
+ rawBytesField= col;
+ inited = false;
+ } else {
+ // select columns that actually do not exist in the file.
+ fieldSkipped = true;
+ }
+ }
+
+ /**
+ * Get the field out of the row without checking parsed. This is called by
+ * both getField and getFieldsAsList.
+ *
+ * The sequence representing NULL value is no longer passed in; it is the
+ * nullSequence held by the enclosing ColumnarStruct.
+ *
+ * @return The value of the field, or null if the field is skipped or its
+ * bytes equal the NULL sequence
+ */
+ protected Object uncheckedGetField() {
+ if (fieldSkipped) {
+ return null;
+ }
+ if (!inited) {
+ try {
+ cachedByteArrayRef.setData(rawBytesField.getData());
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ field.init(cachedByteArrayRef, rawBytesField
+ .getStart(), rawBytesField.getLength());
+ inited = true;
+ }
- if (fieldLen == nullSequence.getLength()
- && LazyUtils.compare(data, rawBytesField[fieldID].getStart(), fieldLen,
- nullSequence.getBytes(), 0, nullSequence.getLength()) == 0) {
- return null;
- }
- return fields[fieldID].getObject();
+ int fieldLen = rawBytesField.length;
+ if (fieldLen == lengthNullSequence) {
+ byte[] data = cachedByteArrayRef.getData();
+
+ if (LazyUtils.compare(data, rawBytesField.getStart(), fieldLen,
+ nullSequence.getBytes(), 0, lengthNullSequence) == 0) {
+ return null;
+ }
+ }
+
+ return field.getObject();
+
+ }
}
+ FieldInfo[] fieldInfoList = null;
+
+
/*
* ============================ [PERF] ===================================
* This function is called for every row. Setting up the selected/projected
@@ -196,11 +228,10 @@ public class ColumnarStruct {
for (int i = 0; i < prjColIDs.length; ++i) {
int fieldIndex = prjColIDs[i];
if (fieldIndex < cols.size()) {
- rawBytesField[fieldIndex] = cols.unCheckedGet(fieldIndex);
- inited[fieldIndex] = false;
+ fieldInfoList[fieldIndex].init(cols.unCheckedGet(fieldIndex));
} else {
// select columns that actually do not exist in the file.
- fieldSkipped[fieldIndex] = true;
+ fieldInfoList[fieldIndex].init(null);
}
}
}
@@ -209,19 +240,19 @@ public class ColumnarStruct {
/**
* Get the values of the fields as an ArrayList.
- *
+ *
* @param nullSequence
* The sequence for the NULL value
* @return The values of the fields as an ArrayList.
*/
- public ArrayList<Object> getFieldsAsList(Text nullSequence) {
+ public ArrayList<Object> getFieldsAsList() {
if (cachedList == null) {
cachedList = new ArrayList<Object>();
} else {
cachedList.clear();
}
- for (int i = 0; i < fields.length; i++) {
- cachedList.add(uncheckedGetField(i, nullSequence));
+ for (int i = 0; i < fieldInfoList.length; i++) {
+ cachedList.add(fieldInfoList[i].uncheckedGetField());
}
return cachedList;
}
Modified: hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ColumnarStructObjectInspector.java
URL: http://svn.apache.org/viewvc/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ColumnarStructObjectInspector.java?rev=1031264&r1=1031263&r2=1031264&view=diff
==============================================================================
--- hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ColumnarStructObjectInspector.java (original)
+++ hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ColumnarStructObjectInspector.java Thu Nov 4 21:33:35 2010
@@ -29,10 +29,10 @@ import org.apache.hadoop.io.Text;
/**
* ColumnarStructObjectInspector works on struct data that is stored in
* ColumnarStruct.
- *
+ *
* The names of the struct fields and the internal structure of the struct
* fields are specified in the ctor of the ColumnarStructObjectInspector.
- *
+ *
* Always use the ObjectInspectorFactory to create new ObjectInspector objects,
* instead of directly creating an instance of this class.
*/
@@ -144,7 +144,7 @@ class ColumnarStructObjectInspector exte
int fieldID = f.getFieldID();
assert (fieldID >= 0 && fieldID < fields.size());
- return struct.getField(fieldID, nullSequence);
+ return struct.getField(fieldID);
}
@Override
@@ -153,6 +153,6 @@ class ColumnarStructObjectInspector exte
return null;
}
ColumnarStruct struct = (ColumnarStruct) data;
- return struct.getFieldsAsList(nullSequence);
+ return struct.getFieldsAsList();
}
}