Posted to commits@hive.apache.org by zs...@apache.org on 2010/01/06 02:29:42 UTC
svn commit: r896296 - in /hadoop/hive/trunk: ./
contrib/src/java/org/apache/hadoop/hive/contrib/serde2/
contrib/src/test/queries/clientpositive/
contrib/src/test/results/clientpositive/
Author: zshao
Date: Wed Jan 6 01:29:42 2010
New Revision: 896296
URL: http://svn.apache.org/viewvc?rev=896296&view=rev
Log:
HIVE-1028. Fix tinyint in TypedBytes. (Namit Jain via zshao)
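In short: serializeField() used to emit a tinyint by copying the value into a
shared one-element byte array and writing it out as a BytesWritable. Under the
typedbytes wire format from HADOOP-1722 that encodes a byte-array record rather
than a single-byte record; assuming the standard typedbytes type codes (0 for a
byte array with a 4-byte length prefix, 1 for a single byte), a tinyint value of
42 went out as

  00 00 00 00 01 2a    (BYTES: type code 0, length 1, value)

where the patch below makes it

  01 2a                (BYTE: type code 1, value)

by routing the value through ByteWritable instead.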
Added:
hadoop/hive/trunk/contrib/src/test/queries/clientpositive/serde_typedbytes4.q
hadoop/hive/trunk/contrib/src/test/results/clientpositive/serde_typedbytes4.q.out
Modified:
hadoop/hive/trunk/CHANGES.txt
hadoop/hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/TypedBytesSerDe.java
Modified: hadoop/hive/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/CHANGES.txt?rev=896296&r1=896295&r2=896296&view=diff
==============================================================================
--- hadoop/hive/trunk/CHANGES.txt (original)
+++ hadoop/hive/trunk/CHANGES.txt Wed Jan 6 01:29:42 2010
@@ -396,6 +396,8 @@
HIVE-1026 Fix test sample6.q to make it deterministic
(John Sichi via namit)
+ HIVE-1028. Fix tinyint in TypedBytes. (Namit Jain via zshao)
+
Release 0.4.0 - Unreleased
INCOMPATIBLE CHANGES
Modified: hadoop/hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/TypedBytesSerDe.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/TypedBytesSerDe.java?rev=896296&r1=896295&r2=896296&view=diff
==============================================================================
--- hadoop/hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/TypedBytesSerDe.java (original)
+++ hadoop/hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/TypedBytesSerDe.java Wed Jan 6 01:29:42 2010
@@ -65,30 +65,30 @@
/**
* TypedBytesSerDe uses typed bytes to serialize/deserialize.
- *
+ *
* More info on the typedbytes stuff that Dumbo uses.
- * http://issues.apache.org/jira/browse/HADOOP-1722
+ * http://issues.apache.org/jira/browse/HADOOP-1722
* A fast python decoder for this, which is apparently 25% faster than the python version, is available at
- * http://github.com/klbostee/ctypedbytes/tree/master
+ * http://github.com/klbostee/ctypedbytes/tree/master
*/
public class TypedBytesSerDe implements SerDe {
public static final Log LOG = LogFactory.getLog(TypedBytesSerDe.class.getName());
-
+
int numColumns;
StructObjectInspector rowOI;
ArrayList<Object> row;
-
+
BytesWritable serializeBytesWritable;
NonSyncDataOutputBuffer barrStr;
TypedBytesWritableOutput tbOut;
-
+
NonSyncDataInputBuffer inBarrStr;
TypedBytesWritableInput tbIn;
-
+
List<String> columnNames;
List<TypeInfo> columnTypes;
-
+
@Override
public void initialize(Configuration conf, Properties tbl)
throws SerDeException {
@@ -97,10 +97,10 @@
serializeBytesWritable = new BytesWritable();
barrStr = new NonSyncDataOutputBuffer();
tbOut = new TypedBytesWritableOutput(barrStr);
-
+
inBarrStr = new NonSyncDataInputBuffer();
tbIn = new TypedBytesWritableInput(inBarrStr);
-
+
// Read the configuration parameters
String columnNameProperty = tbl.getProperty(Constants.LIST_COLUMNS);
String columnTypeProperty = tbl.getProperty(Constants.LIST_COLUMN_TYPES);
@@ -115,29 +115,29 @@
}
assert columnNames.size() == columnTypes.size();
- numColumns = columnNames.size();
-
+ numColumns = columnNames.size();
+
// All columns have to be primitive.
for (int c = 0; c < numColumns; c++) {
if (columnTypes.get(c).getCategory() != Category.PRIMITIVE) {
- throw new SerDeException(getClass().getName()
- + " only accepts primitive columns, but column[" + c
+ throw new SerDeException(getClass().getName()
+ + " only accepts primitive columns, but column[" + c
+ "] named " + columnNames.get(c) + " has category "
+ columnTypes.get(c).getCategory());
}
}
-
+
// Constructing the row ObjectInspector:
- // The row consists of some string columns, each column will be a java
+ // The row consists of some string columns, each column will be a java
// String object.
List<ObjectInspector> columnOIs = new ArrayList<ObjectInspector>(columnNames.size());
for (int c = 0; c < numColumns; c++) {
columnOIs.add(TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(columnTypes.get(c)));
}
-
- // StandardStruct uses ArrayList to store the row.
+
+ // StandardStruct uses ArrayList to store the row.
rowOI = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, columnOIs);
-
+
// Constructing the row object, etc, which will be reused for all rows.
row = new ArrayList<Object>(numColumns);
for (int c = 0; c < numColumns; c++) {
@@ -154,12 +154,12 @@
public Class<? extends Writable> getSerializedClass() {
return BytesWritable.class;
}
-
+
@Override
public Object deserialize(Writable blob) throws SerDeException {
BytesWritable data = (BytesWritable)blob;
- inBarrStr.reset(data.get(), 0, data.getSize());
+ inBarrStr.reset(data.get(), 0, data.getSize());
try {
@@ -169,11 +169,11 @@
// The next byte should be the marker
assert tbIn.readTypeCode() == Type.ENDOFRECORD;
-
+
} catch (IOException e) {
throw new SerDeException(e);
}
-
+
return row;
}
@@ -237,9 +237,9 @@
}
}
// Currently, deserialization of complex types is not supported
- case LIST:
+ case LIST:
case MAP:
- case STRUCT:
+ case STRUCT:
default: {
throw new RuntimeException("Unsupported category: " + type.getCategory());
}
@@ -247,7 +247,7 @@
}
-
+
@Override
public Writable serialize(Object obj, ObjectInspector objInspector)
throws SerDeException {
@@ -255,25 +255,23 @@
barrStr.reset();
StructObjectInspector soi = (StructObjectInspector)objInspector;
List<? extends StructField> fields = soi.getAllStructFieldRefs();
-
+
for (int i = 0; i < numColumns; i++) {
Object o = soi.getStructFieldData(obj, fields.get(i));
- ObjectInspector oi = fields.get(i).getFieldObjectInspector();
+ ObjectInspector oi = fields.get(i).getFieldObjectInspector();
serializeField(o, oi, row.get(i));
}
-
+
// End of the record is part of the data
tbOut.writeEndOfRecord();
-
+
serializeBytesWritable.set(barrStr.getData(), 0, barrStr.getLength());
} catch (IOException e) {
throw new SerDeException(e.getMessage());
}
return serializeBytesWritable;
}
-
- private byte[] tmpByteArr = new byte[1];
-
+
private void serializeField(Object o, ObjectInspector oi, Object reuse) throws IOException {
switch (oi.getCategory()) {
case PRIMITIVE: {
@@ -291,9 +289,8 @@
}
case BYTE: {
ByteObjectInspector boi = (ByteObjectInspector)poi;
- BytesWritable r = reuse == null ? new BytesWritable() : (BytesWritable)reuse;
- tmpByteArr[0] = boi.get(o);
- r.set(tmpByteArr, 0, 1);
+ ByteWritable r = reuse == null ? new ByteWritable() : (ByteWritable)reuse;
+ r.set(boi.get(o));
tbOut.write(r);
return;
}
@@ -343,13 +340,13 @@
}
}
}
- case LIST:
+ case LIST:
case MAP:
case STRUCT: {
// For complex object, serialize to JSON format
String s = SerDeUtils.getJSONString(o, oi);
Text t = reuse == null ? new Text() : (Text)reuse;
-
+
// convert to Text and write it
t.set(s);
tbOut.write(t);
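For illustration only (not part of this commit), a minimal standalone sketch of
the fixed BYTE path. It assumes the hive_contrib jar on the classpath and uses
only calls visible in the diff above: the TypedBytesWritableOutput constructor
that takes a data output, ByteWritable.set(byte), and write(Writable). The class
name TinyintTypedBytesSketch is hypothetical.

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;

import org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesWritableOutput;
import org.apache.hadoop.hive.serde2.io.ByteWritable;

public class TinyintTypedBytesSketch {
  public static void main(String[] args) throws Exception {
    ByteArrayOutputStream buf = new ByteArrayOutputStream();
    TypedBytesWritableOutput tbOut =
        new TypedBytesWritableOutput(new DataOutputStream(buf));

    // The fixed BYTE case: reuse a ByteWritable and hand it to the
    // typed bytes writer, instead of wrapping the value in a
    // one-element byte array and writing a BytesWritable.
    ByteWritable r = new ByteWritable();
    r.set((byte) 42);
    tbOut.write(r);

    // Expect two bytes: the single-byte type code, then 0x2a.
    for (byte b : buf.toByteArray()) {
      System.out.printf("%02x ", b);
    }
    System.out.println();
  }
}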
Added: hadoop/hive/trunk/contrib/src/test/queries/clientpositive/serde_typedbytes4.q
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/contrib/src/test/queries/clientpositive/serde_typedbytes4.q?rev=896296&view=auto
==============================================================================
--- hadoop/hive/trunk/contrib/src/test/queries/clientpositive/serde_typedbytes4.q (added)
+++ hadoop/hive/trunk/contrib/src/test/queries/clientpositive/serde_typedbytes4.q Wed Jan 6 01:29:42 2010
@@ -0,0 +1,31 @@
+add jar ../build/contrib/hive_contrib.jar;
+
+drop table dest1;
+CREATE TABLE dest1(key STRING, value STRING) STORED AS TEXTFILE;
+
+EXPLAIN
+FROM (
+ FROM src
+ SELECT TRANSFORM(cast(src.key as tinyint), src.value) ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe'
+ RECORDWRITER 'org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesRecordWriter'
+ USING '/bin/cat'
+ AS (tkey, tvalue) ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe'
+ RECORDREADER 'org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesRecordReader'
+ WHERE key < 100
+) tmap
+INSERT OVERWRITE TABLE dest1 SELECT tkey, tvalue ORDER by tkey, tvalue;
+
+FROM (
+ FROM src
+ SELECT TRANSFORM(cast(src.key as tinyint), src.value) ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe'
+ RECORDWRITER 'org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesRecordWriter'
+ USING '/bin/cat'
+ AS (tkey, tvalue) ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe'
+ RECORDREADER 'org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesRecordReader'
+ WHERE key < 100
+) tmap
+INSERT OVERWRITE TABLE dest1 SELECT tkey, tvalue ORDER by tkey, tvalue;
+
+SELECT dest1.* FROM dest1;
+
+drop table dest1;
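The script above exercises the fix end to end: src.key is cast to tinyint,
serialized by TypedBytesSerDe through TypedBytesRecordWriter, piped through
/bin/cat, and read back through TypedBytesRecordReader into string columns.
Before this patch the tinyint column would presumably have been written with
the wrong typedbytes type code and failed to round-trip. The query is first
EXPLAINed and then run, and the contents of dest1 are verified in the .q.out
below.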
Added: hadoop/hive/trunk/contrib/src/test/results/clientpositive/serde_typedbytes4.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/contrib/src/test/results/clientpositive/serde_typedbytes4.q.out?rev=896296&view=auto
==============================================================================
--- hadoop/hive/trunk/contrib/src/test/results/clientpositive/serde_typedbytes4.q.out (added)
+++ hadoop/hive/trunk/contrib/src/test/results/clientpositive/serde_typedbytes4.q.out Wed Jan 6 01:29:42 2010
@@ -0,0 +1,232 @@
+PREHOOK: query: drop table dest1
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table dest1
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE dest1(key STRING, value STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE dest1(key STRING, value STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@dest1
+PREHOOK: query: EXPLAIN
+FROM (
+ FROM src
+ SELECT TRANSFORM(cast(src.key as tinyint), src.value) ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe'
+ RECORDWRITER 'org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesRecordWriter'
+ USING '/bin/cat'
+ AS (tkey, tvalue) ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe'
+ RECORDREADER 'org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesRecordReader'
+ WHERE key < 100
+) tmap
+INSERT OVERWRITE TABLE dest1 SELECT tkey, tvalue ORDER by tkey, tvalue
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN
+FROM (
+ FROM src
+ SELECT TRANSFORM(cast(src.key as tinyint), src.value) ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe'
+ RECORDWRITER 'org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesRecordWriter'
+ USING '/bin/cat'
+ AS (tkey, tvalue) ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe'
+ RECORDREADER 'org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesRecordReader'
+ WHERE key < 100
+) tmap
+INSERT OVERWRITE TABLE dest1 SELECT tkey, tvalue ORDER by tkey, tvalue
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF src)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TRANSFORM (TOK_EXPLIST (TOK_FUNCTION TOK_TINYINT (. (TOK_TABLE_OR_COL src) key)) (. (TOK_TABLE_OR_COL src) value)) (TOK_SERDE (TOK_SERDENAME 'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe')) (TOK_RECORDWRITER 'org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesRecordWriter') '/bin/cat' (TOK_SERDE (TOK_SERDENAME 'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe')) (TOK_RECORDREADER 'org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesRecordReader') (TOK_ALIASLIST tkey tvalue)))) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 100)))) tmap)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB dest1)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL tkey)) (TOK_SELEXPR (TOK_TABLE_OR_COL tvalue))) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL tkey)) (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL tvalue)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ tmap:src
+ TableScan
+ alias: src
+ Filter Operator
+ predicate:
+ expr: (key < 100)
+ type: boolean
+ Filter Operator
+ predicate:
+ expr: (key < 100)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: UDFToByte(key)
+ type: tinyint
+ expr: value
+ type: string
+ outputColumnNames: _col0, _col1
+ Transform Operator
+ command: /bin/cat
+ output info:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ sort order: ++
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ Reduce Operator Tree:
+ Extract
+ File Output Operator
+ compressed: false
+ GlobalTableId: 1
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: dest1
+
+ Stage: Stage-0
+ Move Operator
+ tables:
+ replace: true
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: dest1
+
+
+PREHOOK: query: FROM (
+ FROM src
+ SELECT TRANSFORM(cast(src.key as tinyint), src.value) ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe'
+ RECORDWRITER 'org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesRecordWriter'
+ USING '/bin/cat'
+ AS (tkey, tvalue) ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe'
+ RECORDREADER 'org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesRecordReader'
+ WHERE key < 100
+) tmap
+INSERT OVERWRITE TABLE dest1 SELECT tkey, tvalue ORDER by tkey, tvalue
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@dest1
+POSTHOOK: query: FROM (
+ FROM src
+ SELECT TRANSFORM(cast(src.key as tinyint), src.value) ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe'
+ RECORDWRITER 'org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesRecordWriter'
+ USING '/bin/cat'
+ AS (tkey, tvalue) ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe'
+ RECORDREADER 'org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesRecordReader'
+ WHERE key < 100
+) tmap
+INSERT OVERWRITE TABLE dest1 SELECT tkey, tvalue ORDER by tkey, tvalue
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@dest1
+PREHOOK: query: SELECT dest1.* FROM dest1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dest1
+PREHOOK: Output: file:/data/users/njain/hive1/hive1/build/ql/tmp/1776839815/10000
+POSTHOOK: query: SELECT dest1.* FROM dest1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dest1
+POSTHOOK: Output: file:/data/users/njain/hive1/hive1/build/ql/tmp/1776839815/10000
+0 val_0
+0 val_0
+0 val_0
+10 val_10
+11 val_11
+12 val_12
+12 val_12
+15 val_15
+15 val_15
+17 val_17
+18 val_18
+18 val_18
+19 val_19
+2 val_2
+20 val_20
+24 val_24
+24 val_24
+26 val_26
+26 val_26
+27 val_27
+28 val_28
+30 val_30
+33 val_33
+34 val_34
+35 val_35
+35 val_35
+35 val_35
+37 val_37
+37 val_37
+4 val_4
+41 val_41
+42 val_42
+42 val_42
+43 val_43
+44 val_44
+47 val_47
+5 val_5
+5 val_5
+5 val_5
+51 val_51
+51 val_51
+53 val_53
+54 val_54
+57 val_57
+58 val_58
+58 val_58
+64 val_64
+65 val_65
+66 val_66
+67 val_67
+67 val_67
+69 val_69
+70 val_70
+70 val_70
+70 val_70
+72 val_72
+72 val_72
+74 val_74
+76 val_76
+76 val_76
+77 val_77
+78 val_78
+8 val_8
+80 val_80
+82 val_82
+83 val_83
+83 val_83
+84 val_84
+84 val_84
+85 val_85
+86 val_86
+87 val_87
+9 val_9
+90 val_90
+90 val_90
+90 val_90
+92 val_92
+95 val_95
+95 val_95
+96 val_96
+97 val_97
+97 val_97
+98 val_98
+98 val_98
+PREHOOK: query: drop table dest1
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table dest1
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Output: default@dest1