Posted to commits@hive.apache.org by cw...@apache.org on 2018/08/29 20:51:27 UTC
[1/2] hive git commit: HIVE-20225: SerDe to support Teradata Binary
Format (Lu Li via cws)
Repository: hive
Updated Branches:
refs/heads/master cf5486dd3 -> b8d82844b
http://git-wip-us.apache.org/repos/asf/hive/blob/b8d82844/serde/src/java/org/apache/hadoop/hive/serde2/teradata/TeradataBinarySerde.java
----------------------------------------------------------------------
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/teradata/TeradataBinarySerde.java b/serde/src/java/org/apache/hadoop/hive/serde2/teradata/TeradataBinarySerde.java
new file mode 100644
index 0000000..ccf5f44
--- /dev/null
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/teradata/TeradataBinarySerde.java
@@ -0,0 +1,597 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.serde2.teradata;
+
+import com.google.common.collect.ImmutableMap;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.common.type.Timestamp;
+import org.apache.hadoop.hive.serde2.io.ByteWritable;
+import org.apache.hadoop.hive.serde2.io.DateWritableV2;
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.hive.serde2.io.HiveCharWritable;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable;
+import org.apache.hadoop.hive.serde2.io.ShortWritable;
+import org.apache.hadoop.hive.serde2.io.TimestampWritableV2;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.hadoop.hive.serde2.AbstractSerDe;
+import org.apache.hadoop.hive.serde2.SerDeException;
+import org.apache.hadoop.hive.serde2.SerDeSpec;
+import org.apache.hadoop.hive.serde2.SerDeStats;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.StructField;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveCharObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveVarcharObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
+import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.hive.common.type.Date;
+
+import javax.annotation.Nullable;
+import java.io.ByteArrayInputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.text.ParseException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+
+import static java.lang.String.format;
+
+/**
+ * https://cwiki.apache.org/confluence/display/Hive/TeradataBinarySerde.
+ * TeradataBinarySerde handles the serialization and deserialization of Teradata Binary Record
+ * passed from TeradataBinaryRecordReader.
+ *
+ * The Teradata Binary Record uses little-endian byte order for SHORT, INT, LONG, DOUBLE...
+ * We extend SwappedDataInputStream to handle these types, and add handling for the
+ * Teradata-specific types like VARCHAR, CHAR, TIMESTAMP, DATE...
+ *
+ * Currently we support 11 Teradata data types: VARCHAR, INTEGER, TIMESTAMP, FLOAT, DATE,
+ * BYTEINT, BIGINT, CHAR, DECIMAL, SMALLINT, VARBYTE.
+ * The mapping between Teradata data type and Hive data type is
+ * Teradata Data Type: Hive Data Type
+ * VARCHAR: VARCHAR,
+ * INTEGER: INT,
+ * TIMESTAMP: TIMESTAMP,
+ * FLOAT: DOUBLE,
+ * DATE: DATE,
+ * BYTEINT: TINYINT,
+ * BIGINT: BIGINT,
+ * CHAR: CHAR,
+ * DECIMAL: DECIMAL,
+ * SMALLINT: SMALLINT,
+ * VARBYTE: BINARY.
+ *
+ * TeradataBinarySerde currently doesn't support complex types like MAP, ARRAY and STRUCT.
+ */
+@SerDeSpec(schemaProps = { serdeConstants.LIST_COLUMNS,
+ serdeConstants.LIST_COLUMN_TYPES }) public class TeradataBinarySerde extends AbstractSerDe {
+ private static final Log LOG = LogFactory.getLog(TeradataBinarySerde.class);
+
+ public static final String TD_SCHEMA_LITERAL = "teradata.schema.literal";
+
+ private StructObjectInspector rowOI;
+ private ArrayList<Object> row;
+ private byte[] inForNull;
+
+ private int numCols;
+ private List<String> columnNames;
+ private List<TypeInfo> columnTypes;
+
+ private TeradataBinaryDataOutputStream out;
+ private BytesWritable serializeBytesWritable;
+ private byte[] outForNull;
+
+ public static final String TD_TIMESTAMP_PRECISION = "teradata.timestamp.precision";
+ private int timestampPrecision;
+ private static final int DEFAULT_TIMESTAMP_BYTE_NUM = 19;
+ private static final String DEFAULT_TIMESTAMP_PRECISION = "6";
+
+ public static final String TD_CHAR_SET = "teradata.char.charset";
+ private String charCharset;
+ private static final String DEFAULT_CHAR_CHARSET = "UNICODE";
+ private static final Map<String, Integer> CHARSET_TO_BYTE_NUM = ImmutableMap.of("LATIN", 2, "UNICODE", 3);
+
+ /**
+ * Initialize the HiveSerializer.
+ *
+ * @param conf
+ * system properties; can be null at compile time
+ * @param tbl
+ * table properties
+ * @throws SerDeException
+ */
+ @Override public void initialize(@Nullable Configuration conf, Properties tbl) throws SerDeException {
+ columnNames = Arrays.asList(tbl.getProperty(serdeConstants.LIST_COLUMNS).split(","));
+
+ String columnTypeProperty = tbl.getProperty(serdeConstants.LIST_COLUMN_TYPES);
+ LOG.debug(serdeConstants.LIST_COLUMN_TYPES + ": " + columnTypeProperty);
+ if (columnTypeProperty.length() == 0) {
+ columnTypes = new ArrayList<TypeInfo>();
+ } else {
+ columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
+ }
+
+ assert columnNames.size() == columnTypes.size();
+ numCols = columnNames.size();
+
+ // get the configured Teradata timestamp precision
+ // the timestamp precision in the binary files generated by TPT/BTEQ is configurable
+ timestampPrecision = Integer.parseInt(tbl.getProperty(TD_TIMESTAMP_PRECISION, DEFAULT_TIMESTAMP_PRECISION));
+
+ // get the configured Teradata char charset
+ // in Teradata, the LATIN charset uses 2 bytes per char and UNICODE uses 3 bytes per char
+ charCharset = tbl.getProperty(TD_CHAR_SET, DEFAULT_CHAR_CHARSET);
+ if (!CHARSET_TO_BYTE_NUM.containsKey(charCharset)) {
+ throw new SerDeException(
+ format("%s isn't a supported Teradata char charset; the supported charsets are %s", charCharset, CHARSET_TO_BYTE_NUM.keySet()));
+ }
+
+ // All columns have to be primitive.
+ // Constructing the row ObjectInspector:
+ List<ObjectInspector> columnOIs = new ArrayList<ObjectInspector>(numCols);
+ for (int i = 0; i < numCols; i++) {
+ if (columnTypes.get(i).getCategory() != ObjectInspector.Category.PRIMITIVE) {
+ throw new SerDeException(
+ getClass().getName() + " only accepts primitive columns, but column[" + i + "] named " + columnNames.get(i)
+ + " has category " + columnTypes.get(i).getCategory());
+ }
+ columnOIs.add(TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(columnTypes.get(i)));
+ }
+
+ rowOI = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, columnOIs);
+
+ // Construct the row object; it will be reused for all rows
+ row = new ArrayList<Object>(numCols);
+ for (int i = 0; i < numCols; i++) {
+ row.add(null);
+ }
+
+ // Initialize vars related to Null Array which represents the null bitmap
+ int byteNumForNullArray = (numCols / 8) + ((numCols % 8 == 0) ? 0 : 1);
+ LOG.debug(format("The Null Bytes for each record will have %s bytes", byteNumForNullArray));
+ inForNull = new byte[byteNumForNullArray];
+
+ out = new TeradataBinaryDataOutputStream();
+ serializeBytesWritable = new BytesWritable();
+ outForNull = new byte[byteNumForNullArray];
+ }
+
+ /**
+ * Returns the Writable class that would be returned by the serialize method.
+ * This is used to initialize SequenceFile header.
+ */
+ @Override public Class<? extends Writable> getSerializedClass() {
+ return BytesWritable.class;
+ }
+
+ /**
+ * Serialize an object by navigating inside the Object with the
+ * ObjectInspector. In most cases, the return value of this function will be
+ * constant since the function will reuse the Writable object. If the client
+ * wants to keep a copy of the Writable, the client needs to clone the
+ * returned value.
+ *
+ * @param obj
+ * @param objInspector
+ */
+ @Override public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException {
+ try {
+ out.reset();
+ final StructObjectInspector outputRowOI = (StructObjectInspector) objInspector;
+ final List<? extends StructField> fieldRefs = outputRowOI.getAllStructFieldRefs();
+
+ if (fieldRefs.size() != numCols) {
+ throw new SerDeException(
+ "Cannot serialize the object because there are " + fieldRefs.size() + " fieldRefs but the table defined "
+ + numCols + " columns.");
+ }
+
+ // Fully refresh the null array and write it into the output
+ for (int i = 0; i < numCols; i++) {
+ Object objectForField = outputRowOI.getStructFieldData(obj, fieldRefs.get(i));
+ if (objectForField == null) {
+ outForNull[i / 8] = (byte) (outForNull[i / 8] | (0x01 << (7 - (i % 8))));
+ } else {
+ outForNull[i / 8] = (byte) (outForNull[i / 8] & ~(0x01 << (7 - (i % 8))));
+ }
+ }
+ out.write(outForNull);
+
+ // serialize each field using FieldObjectInspector
+ for (int i = 0; i < numCols; i++) {
+ Object objectForField = outputRowOI.getStructFieldData(obj, fieldRefs.get(i));
+ serializeField(objectForField, fieldRefs.get(i).getFieldObjectInspector(), columnTypes.get(i));
+ }
+
+ serializeBytesWritable.set(out.toByteArray(), 0, out.size());
+ return serializeBytesWritable;
+ } catch (IOException e) {
+ throw new SerDeException(e);
+ }
+ }
+
+ private void serializeField(Object objectForField, ObjectInspector oi, TypeInfo ti)
+ throws IOException, SerDeException {
+ switch (oi.getCategory()) {
+ case PRIMITIVE:
+ PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi;
+ switch (poi.getPrimitiveCategory()) {
+ // Teradata Type: BYTEINT
+ case BYTE:
+ ByteObjectInspector boi = (ByteObjectInspector) poi;
+ byte b = 0;
+ if (objectForField != null) {
+ b = boi.get(objectForField);
+ }
+ out.write(b);
+ return;
+ // Teradata Type: SMALLINT
+ case SHORT:
+ ShortObjectInspector spoi = (ShortObjectInspector) poi;
+ short s = 0;
+ if (objectForField != null) {
+ s = spoi.get(objectForField);
+ }
+ out.writeShort(s);
+ return;
+ // Teradata Type: INT
+ case INT:
+ IntObjectInspector ioi = (IntObjectInspector) poi;
+ int i = 0;
+ if (objectForField != null) {
+ i = ioi.get(objectForField);
+ }
+ out.writeInt(i);
+ return;
+ // Teradata Type: BIGINT
+ case LONG:
+ LongObjectInspector loi = (LongObjectInspector) poi;
+ long l = 0;
+ if (objectForField != null) {
+ l = loi.get(objectForField);
+ }
+ out.writeLong(l);
+ return;
+ // Teradata Type: FLOAT
+ case DOUBLE:
+ DoubleObjectInspector doi = (DoubleObjectInspector) poi;
+ double d = 0;
+ if (objectForField != null) {
+ d = doi.get(objectForField);
+ }
+ out.writeDouble(d);
+ return;
+ // Teradata Type: VARCHAR
+ case VARCHAR:
+ HiveVarcharObjectInspector hvoi = (HiveVarcharObjectInspector) poi;
+ HiveVarcharWritable hv = hvoi.getPrimitiveWritableObject(objectForField);
+ // assert the length of varchar record fits into the table definition
+ if (hv != null) {
+ assert ((VarcharTypeInfo) ti).getLength() >= hv.getHiveVarchar().getCharacterLength();
+ }
+ out.writeVarChar(hv);
+ return;
+ // Teradata Type: TIMESTAMP
+ case TIMESTAMP:
+ TimestampObjectInspector tsoi = (TimestampObjectInspector) poi;
+ TimestampWritableV2 ts = tsoi.getPrimitiveWritableObject(objectForField);
+ out.writeTimestamp(ts, getTimeStampByteNum(timestampPrecision));
+ return;
+ // Teradata Type: DATE
+ case DATE:
+ DateObjectInspector dtoi = (DateObjectInspector) poi;
+ DateWritableV2 dw = dtoi.getPrimitiveWritableObject(objectForField);
+ out.writeDate(dw);
+ return;
+ // Teradata Type: CHAR
+ case CHAR:
+ HiveCharObjectInspector coi = (HiveCharObjectInspector) poi;
+ HiveCharWritable hc = coi.getPrimitiveWritableObject(objectForField);
+ // assert the length of char record fits into the table definition
+ if (hc != null) {
+ assert ((CharTypeInfo) ti).getLength() >= hc.getHiveChar().getCharacterLength();
+ }
+ out.writeChar(hc, getCharByteNum(charCharset) * ((CharTypeInfo) ti).getLength());
+ return;
+ // Teradata Type: DECIMAL
+ case DECIMAL:
+ DecimalTypeInfo dtype = (DecimalTypeInfo) ti;
+ int precision = dtype.precision();
+ int scale = dtype.scale();
+ HiveDecimalObjectInspector hdoi = (HiveDecimalObjectInspector) poi;
+ HiveDecimalWritable hd = hdoi.getPrimitiveWritableObject(objectForField);
+ // assert the precision of decimal record fits into the table definition
+ if (hd != null) {
+ assert (dtype.getPrecision() >= hd.precision());
+ }
+ out.writeDecimal(hd, getDecimalByteNum(precision), scale);
+ return;
+ // Teradata Type: VARBYTE
+ case BINARY:
+ BinaryObjectInspector bnoi = (BinaryObjectInspector) poi;
+ BytesWritable byw = bnoi.getPrimitiveWritableObject(objectForField);
+ out.writeVarByte(byw);
+ return;
+ default:
+ throw new SerDeException("Unrecognized type: " + poi.getPrimitiveCategory());
+ }
+ // Currently, serialization of complex types is not supported
+ case LIST:
+ case MAP:
+ case STRUCT:
+ default:
+ throw new SerDeException("Unrecognized type: " + oi.getCategory());
+ }
+ }
+
+ @Override public SerDeStats getSerDeStats() {
+ // no support for statistics
+ return null;
+ }
+
+ /**
+ * Deserialize an object out of a Writable blob. In most cases, the return
+ * value of this function will be constant since the function will reuse the
+ * returned object. If the client wants to keep a copy of the object, the
+ * client needs to clone the returned value by calling
+ * ObjectInspectorUtils.getStandardObject().
+ *
+ * @param blob
+ * The Writable object containing a serialized object
+ * @return A Java object representing the contents in the blob.
+ */
+ @Override public Object deserialize(Writable blob) throws SerDeException {
+ try {
+ BytesWritable data = (BytesWritable) blob;
+
+ // initialize the data to be the input stream
+ TeradataBinaryDataInputStream in =
+ new TeradataBinaryDataInputStream(new ByteArrayInputStream(data.getBytes(), 0, data.getLength()));
+
+ int numOfByteRead = in.read(inForNull);
+
+ if (inForNull.length != 0 && numOfByteRead != inForNull.length) {
+ throw new EOFException("not enough bytes for one object");
+ }
+
+ boolean isNull;
+ for (int i = 0; i < numCols; i++) {
+ // get if the ith field is null or not
+ isNull = ((inForNull[i / 8] & (128 >> (i % 8))) != 0);
+ row.set(i, deserializeField(in, columnTypes.get(i), row.get(i), isNull));
+ }
+
+ //After deserializing all the fields, the input should be exhausted, in which case in.read will return -1
+ if (in.read() != -1) {
+ throw new EOFException("The input stream still has data after all the fields were deserialized - this is unexpected");
+ }
+ } catch (EOFException e) {
+ LOG.warn("Catch thrown exception", e);
+ LOG.warn("This record has been polluted. We have reset all the row fields to be null");
+ for (int i = 0; i < numCols; i++) {
+ row.set(i, null);
+ }
+ } catch (IOException e) {
+ throw new SerDeException(e);
+ } catch (ParseException e) {
+ throw new SerDeException(e);
+ }
+ return row;
+ }
+
+ private Object deserializeField(TeradataBinaryDataInputStream in, TypeInfo type, Object reuse, boolean isNull)
+ throws IOException, ParseException, SerDeException {
+ // isNull:
+ // In the Teradata Binary file, even when a field is null (isNull=true),
+ // the data still contains default values to pad the record.
+ // In this case, the bytes must still be read even though they are not used.
+ switch (type.getCategory()) {
+ case PRIMITIVE:
+ PrimitiveTypeInfo ptype = (PrimitiveTypeInfo) type;
+ switch (ptype.getPrimitiveCategory()) {
+ case VARCHAR: // Teradata Type: VARCHAR
+ String st = in.readVarchar();
+ if (isNull) {
+ return null;
+ } else {
+ HiveVarcharWritable r = reuse == null ? new HiveVarcharWritable() : (HiveVarcharWritable) reuse;
+ r.set(st, ((VarcharTypeInfo) type).getLength());
+ return r;
+ }
+ case INT: // Teradata Type: INT
+ int i = in.readInt();
+ if (isNull) {
+ return null;
+ } else {
+ IntWritable r = reuse == null ? new IntWritable() : (IntWritable) reuse;
+ r.set(i);
+ return r;
+ }
+ case TIMESTAMP: // Teradata Type: TIMESTAMP
+ Timestamp ts = in.readTimestamp(getTimeStampByteNum(timestampPrecision));
+ if (isNull) {
+ return null;
+ } else {
+ TimestampWritableV2 r = reuse == null ? new TimestampWritableV2() : (TimestampWritableV2) reuse;
+ r.set(ts);
+ return r;
+ }
+ case DOUBLE: // Teradata Type: FLOAT
+ double d = in.readDouble();
+ if (isNull) {
+ return null;
+ } else {
+ DoubleWritable r = reuse == null ? new DoubleWritable() : (DoubleWritable) reuse;
+ r.set(d);
+ return r;
+ }
+ case DATE: // Teradata Type: DATE
+ Date dt = in.readDate();
+ if (isNull) {
+ return null;
+ } else {
+ DateWritableV2 r = reuse == null ? new DateWritableV2() : (DateWritableV2) reuse;
+ r.set(dt);
+ return r;
+ }
+ case BYTE: // Teradata Type: BYTEINT
+ byte bt = in.readByte();
+ if (isNull) {
+ return null;
+ } else {
+ ByteWritable r = reuse == null ? new ByteWritable() : (ByteWritable) reuse;
+ r.set(bt);
+ return r;
+ }
+ case LONG: // Teradata Type: BIGINT
+ long l = in.readLong();
+ if (isNull) {
+ return null;
+ } else {
+ LongWritable r = reuse == null ? new LongWritable() : (LongWritable) reuse;
+ r.set(l);
+ return r;
+ }
+ case CHAR: // Teradata Type: CHAR
+ CharTypeInfo ctype = (CharTypeInfo) type;
+ int length = ctype.getLength();
+ String c = in.readChar(length * getCharByteNum(charCharset));
+ if (isNull) {
+ return null;
+ } else {
+ HiveCharWritable r = reuse == null ? new HiveCharWritable() : (HiveCharWritable) reuse;
+ r.set(c, length);
+ return r;
+ }
+ case DECIMAL: // Teradata Type: DECIMAL
+ DecimalTypeInfo dtype = (DecimalTypeInfo) type;
+ int precision = dtype.precision();
+ int scale = dtype.scale();
+ HiveDecimal hd = in.readDecimal(scale, getDecimalByteNum(precision));
+ if (isNull) {
+ return null;
+ } else {
+ HiveDecimalWritable r = (reuse == null ? new HiveDecimalWritable() : (HiveDecimalWritable) reuse);
+ r.set(hd);
+ return r;
+ }
+ case SHORT: // Teradata Type: SMALLINT
+ short s = in.readShort();
+ if (isNull) {
+ return null;
+ } else {
+ ShortWritable r = reuse == null ? new ShortWritable() : (ShortWritable) reuse;
+ r.set(s);
+ return r;
+ }
+ case BINARY: // Teradata Type: VARBYTE
+ byte[] content = in.readVarbyte();
+ if (isNull) {
+ return null;
+ } else {
+ BytesWritable r = new BytesWritable();
+ r.set(content, 0, content.length);
+ return r;
+ }
+ default:
+ throw new SerDeException("Unrecognized type: " + ptype.getPrimitiveCategory());
+ }
+ // Currently, deserialization of complex types is not supported
+ case LIST:
+ case MAP:
+ case STRUCT:
+ default:
+ throw new SerDeException("Unsupported category: " + type.getCategory());
+ }
+ }
+
+ /**
+ * Get the object inspector that can be used to navigate through the internal
+ * structure of the Object returned from deserialize(...).
+ */
+ @Override public ObjectInspector getObjectInspector() throws SerDeException {
+ return rowOI;
+ }
+
+ private int getTimeStampByteNum(int precision) {
+ if (precision == 0) {
+ return DEFAULT_TIMESTAMP_BYTE_NUM;
+ } else {
+ return precision + 1 + DEFAULT_TIMESTAMP_BYTE_NUM;
+ }
+ }
+
+ private int getCharByteNum(String charset) throws SerDeException {
+ if (!CHARSET_TO_BYTE_NUM.containsKey(charset)) {
+ throw new SerDeException(
+ format("%s isn't a supported Teradata char charset; the supported charsets are %s", charset, CHARSET_TO_BYTE_NUM.keySet()));
+ } else {
+ return CHARSET_TO_BYTE_NUM.get(charset);
+ }
+ }
+
+ private int getDecimalByteNum(int precision) throws SerDeException {
+ if (precision <= 0) {
+ throw new SerDeException(format("the precision of Decimal should be bigger than 0. %d is illegal", precision));
+ }
+ if (precision <= 2) {
+ return 1;
+ }
+ if (precision <= 4) {
+ return 2;
+ }
+ if (precision <= 9) {
+ return 4;
+ }
+ if (precision <= 18) {
+ return 8;
+ }
+ if (precision <= 38) {
+ return 16;
+ }
+ throw new IllegalArgumentException(
+ format("the precision of Decimal should be no greater than 38. %d is illegal", precision));
+ }
+}
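A note on the null bitmap used by serialize/deserialize above: each record carries ceil(numCols / 8) leading bytes, one bit per column, most significant bit first within each byte. The standalone sketch below mirrors that bit arithmetic; the class and method names are ours, for illustration only, and are not part of the patch.

    import java.util.Arrays;

    public class NullBitmapSketch {
      // One bit per column, most significant bit first within each byte,
      // mirroring the masks (0x01 << (7 - i % 8)) and (128 >> i % 8) above.
      static byte[] encodeNullBitmap(Object[] row) {
        byte[] bitmap = new byte[(row.length / 8) + ((row.length % 8 == 0) ? 0 : 1)];
        for (int i = 0; i < row.length; i++) {
          if (row[i] == null) {
            bitmap[i / 8] |= (0x01 << (7 - (i % 8)));
          }
        }
        return bitmap;
      }

      static boolean isNull(byte[] bitmap, int i) {
        return (bitmap[i / 8] & (128 >> (i % 8))) != 0;
      }

      public static void main(String[] args) {
        Object[] row = { "a", null, "c", null, null, 1, 2, 3, null };
        byte[] bitmap = encodeNullBitmap(row); // 9 columns -> 2 bytes
        System.out.println(Arrays.toString(bitmap)); // [88, -128], i.e. 0x58 0x80
        System.out.println(isNull(bitmap, 1)); // true
      }
    }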
http://git-wip-us.apache.org/repos/asf/hive/blob/b8d82844/serde/src/test/org/apache/hadoop/hive/serde2/teradata/TestTeradataBinarySerdeForDate.java
----------------------------------------------------------------------
diff --git a/serde/src/test/org/apache/hadoop/hive/serde2/teradata/TestTeradataBinarySerdeForDate.java b/serde/src/test/org/apache/hadoop/hive/serde2/teradata/TestTeradataBinarySerdeForDate.java
new file mode 100644
index 0000000..af81fe3
--- /dev/null
+++ b/serde/src/test/org/apache/hadoop/hive/serde2/teradata/TestTeradataBinarySerdeForDate.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.serde2.teradata;
+
+import com.google.common.io.BaseEncoding;
+import junit.framework.TestCase;
+import org.apache.hadoop.hive.common.type.Date;
+import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.hadoop.hive.serde2.io.DateWritableV2;
+import org.apache.hadoop.io.BytesWritable;
+import org.junit.Assert;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.Properties;
+
+/**
+ * Test the data type DATE for Teradata binary format.
+ */
+public class TestTeradataBinarySerdeForDate extends TestCase {
+
+ private final TeradataBinarySerde serde = new TeradataBinarySerde();
+ private final Properties props = new Properties();
+
+ protected void setUp() throws Exception {
+ props.setProperty(serdeConstants.LIST_COLUMNS, "TD_DATE");
+ props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "date");
+ serde.initialize(null, props);
+ }
+
+ public void testDateBefore1900() throws Exception {
+
+ //0060-01-01
+ BytesWritable in = new BytesWritable(BaseEncoding.base16().lowerCase().decode("00653de7fe"));
+
+ List<Object> row = (List<Object>) serde.deserialize(in);
+ Date ts = ((DateWritableV2) row.get(0)).get();
+ Assert.assertEquals(ts.getYear(), 60);
+ Assert.assertEquals(ts.getMonth(), 1);
+ Assert.assertEquals(ts.getDay(), 1);
+
+ BytesWritable res = (BytesWritable) serde.serialize(row, serde.getObjectInspector());
+ Assert.assertTrue(Arrays.equals(in.copyBytes(), res.copyBytes()));
+ }
+
+ public void testDateAfter1900() throws Exception {
+
+ //9999-01-01
+ BytesWritable in = new BytesWritable(BaseEncoding.base16().lowerCase().decode("0095cfd304"));
+
+ List<Object> row = (List<Object>) serde.deserialize(in);
+ Date ts = ((DateWritableV2) row.get(0)).get();
+ Assert.assertEquals(ts.getYear(), 9999);
+ Assert.assertEquals(ts.getMonth(), 1);
+ Assert.assertEquals(ts.getDay(), 1);
+
+ BytesWritable res = (BytesWritable) serde.serialize(row, serde.getObjectInspector());
+ Assert.assertTrue(Arrays.equals(in.copyBytes(), res.copyBytes()));
+ }
+}
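Both fixtures above are consistent with DATE being stored as a little-endian 32-bit signed integer equal to (year - 1900) * 10000 + month * 100 + day, preceded by the one-byte null bitmap; years before 1900 yield a negative integer, which is why the first fixture decodes to year 0060. A quick independent check (not part of the patch):

    import java.nio.ByteBuffer;
    import java.nio.ByteOrder;

    public class TeradataDateCheck {
      // Encode a date as (year - 1900) * 10000 + month * 100 + day,
      // written as a little-endian 32-bit integer.
      static byte[] encode(int year, int month, int day) {
        int v = (year - 1900) * 10000 + month * 100 + day;
        return ByteBuffer.allocate(4).order(ByteOrder.LITTLE_ENDIAN).putInt(v).array();
      }

      public static void main(String[] args) {
        print(encode(60, 1, 1));   // 65 3d e7 fe -> matches "653de7fe" above
        print(encode(9999, 1, 1)); // 95 cf d3 04 -> matches "95cfd304" above
      }

      static void print(byte[] bytes) {
        for (byte b : bytes) {
          System.out.printf("%02x ", b);
        }
        System.out.println();
      }
    }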
http://git-wip-us.apache.org/repos/asf/hive/blob/b8d82844/serde/src/test/org/apache/hadoop/hive/serde2/teradata/TestTeradataBinarySerdeForDecimal.java
----------------------------------------------------------------------
diff --git a/serde/src/test/org/apache/hadoop/hive/serde2/teradata/TestTeradataBinarySerdeForDecimal.java b/serde/src/test/org/apache/hadoop/hive/serde2/teradata/TestTeradataBinarySerdeForDecimal.java
new file mode 100644
index 0000000..6abdd3f
--- /dev/null
+++ b/serde/src/test/org/apache/hadoop/hive/serde2/teradata/TestTeradataBinarySerdeForDecimal.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.serde2.teradata;
+
+import com.google.common.io.BaseEncoding;
+import junit.framework.TestCase;
+import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.hadoop.io.BytesWritable;
+import org.junit.Assert;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.Properties;
+
+/**
+ * Test the data type DECIMAL for Teradata binary format.
+ */
+public class TestTeradataBinarySerdeForDecimal extends TestCase {
+
+ private final TeradataBinarySerde serde = new TeradataBinarySerde();
+ private final Properties props = new Properties();
+
+ protected void setUp() throws Exception {
+ props.setProperty(serdeConstants.LIST_COLUMNS, "TD_DECIMAL");
+ props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "decimal(9,5)");
+
+ serde.initialize(null, props);
+ }
+
+ public void testPositiveFraction() throws Exception {
+ BytesWritable in = new BytesWritable(BaseEncoding.base16().lowerCase().decode("0064000000"));
+
+ List<Object> row = (List<Object>) serde.deserialize(in);
+ Assert.assertTrue("0.001".equals(((HiveDecimalWritable) row.get(0)).getHiveDecimal().toString()));
+
+ BytesWritable res = (BytesWritable) serde.serialize(row, serde.getObjectInspector());
+ Assert.assertTrue(Arrays.equals(in.copyBytes(), res.copyBytes()));
+ }
+
+ public void testNegativeFraction() throws Exception {
+ BytesWritable in = new BytesWritable(BaseEncoding.base16().lowerCase().decode("009cffffff"));
+
+ List<Object> row = (List<Object>) serde.deserialize(in);
+ Assert.assertTrue("-0.001".equals(((HiveDecimalWritable) row.get(0)).getHiveDecimal().toString()));
+
+ BytesWritable res = (BytesWritable) serde.serialize(row, serde.getObjectInspector());
+ Assert.assertTrue(Arrays.equals(in.copyBytes(), res.copyBytes()));
+ }
+
+ public void testPositiveNumber1() throws Exception {
+ BytesWritable in = new BytesWritable(BaseEncoding.base16().lowerCase().decode("00a0860100"));
+
+ List<Object> row = (List<Object>) serde.deserialize(in);
+ Assert.assertTrue("1".equals(((HiveDecimalWritable) row.get(0)).getHiveDecimal().toString()));
+
+ BytesWritable res = (BytesWritable) serde.serialize(row, serde.getObjectInspector());
+ Assert.assertTrue(Arrays.equals(in.copyBytes(), res.copyBytes()));
+ }
+
+ public void testNegativeNumber1() throws Exception {
+ BytesWritable in = new BytesWritable(BaseEncoding.base16().lowerCase().decode("006079feff"));
+
+ List<Object> row = (List<Object>) serde.deserialize(in);
+ Assert.assertTrue("-1".equals(((HiveDecimalWritable) row.get(0)).getHiveDecimal().toString()));
+
+ BytesWritable res = (BytesWritable) serde.serialize(row, serde.getObjectInspector());
+ Assert.assertTrue(Arrays.equals(in.copyBytes(), res.copyBytes()));
+ }
+
+ public void testPositiveNumber2() throws Exception {
+ BytesWritable in = new BytesWritable(BaseEncoding.base16().lowerCase().decode("0080969800"));
+
+ List<Object> row = (List<Object>) serde.deserialize(in);
+ Assert.assertTrue("100".equals(((HiveDecimalWritable) row.get(0)).getHiveDecimal().toString()));
+
+ BytesWritable res = (BytesWritable) serde.serialize(row, serde.getObjectInspector());
+ Assert.assertTrue(Arrays.equals(in.copyBytes(), res.copyBytes()));
+ }
+
+ public void testNegativeNumber2() throws Exception {
+ BytesWritable in = new BytesWritable(BaseEncoding.base16().lowerCase().decode("000065c4e0"));
+
+ List<Object> row = (List<Object>) serde.deserialize(in);
+ Assert.assertTrue("-5240".equals(((HiveDecimalWritable) row.get(0)).getHiveDecimal().toString()));
+
+ BytesWritable res = (BytesWritable) serde.serialize(row, serde.getObjectInspector());
+ Assert.assertTrue(Arrays.equals(in.copyBytes(), res.copyBytes()));
+ }
+}
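These DECIMAL fixtures follow the byte widths from getDecimalByteNum (precision 9 maps to 4 payload bytes) and store the unscaled value as a little-endian two's-complement integer. For instance, "0064000000" is one null-bitmap byte followed by 100, which at scale 5 is 0.00100; HiveDecimal then trims trailing zeros, which is why the test expects "0.001". A minimal decoding sketch, written independently of the patch:

    import java.math.BigDecimal;
    import java.math.BigInteger;

    public class TeradataDecimalCheck {
      // Decode a little-endian two's-complement unscaled integer at the given scale.
      static BigDecimal decode(byte[] littleEndian, int scale) {
        byte[] bigEndian = new byte[littleEndian.length];
        for (int i = 0; i < littleEndian.length; i++) {
          bigEndian[i] = littleEndian[littleEndian.length - 1 - i];
        }
        return new BigDecimal(new BigInteger(bigEndian), scale);
      }

      public static void main(String[] args) {
        // payload of "0064000000" after the null-bitmap byte
        System.out.println(decode(new byte[] { 0x64, 0x00, 0x00, 0x00 }, 5)); // 0.00100
        // payload of "009cffffff" after the null-bitmap byte (-100)
        System.out.println(decode(
            new byte[] { (byte) 0x9c, (byte) 0xff, (byte) 0xff, (byte) 0xff }, 5)); // -0.00100
      }
    }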
http://git-wip-us.apache.org/repos/asf/hive/blob/b8d82844/serde/src/test/org/apache/hadoop/hive/serde2/teradata/TestTeradataBinarySerdeForTimeStamp.java
----------------------------------------------------------------------
diff --git a/serde/src/test/org/apache/hadoop/hive/serde2/teradata/TestTeradataBinarySerdeForTimeStamp.java b/serde/src/test/org/apache/hadoop/hive/serde2/teradata/TestTeradataBinarySerdeForTimeStamp.java
new file mode 100644
index 0000000..a6cf2c1
--- /dev/null
+++ b/serde/src/test/org/apache/hadoop/hive/serde2/teradata/TestTeradataBinarySerdeForTimeStamp.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.serde2.teradata;
+
+import com.google.common.io.BaseEncoding;
+import junit.framework.TestCase;
+import org.apache.hadoop.hive.common.type.Timestamp;
+import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.hadoop.hive.serde2.io.TimestampWritableV2;
+import org.apache.hadoop.io.BytesWritable;
+import org.junit.Assert;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.Properties;
+
+/**
+ * Test the data type TIMESTAMP for Teradata binary format.
+ */
+public class TestTeradataBinarySerdeForTimeStamp extends TestCase {
+
+ private final TeradataBinarySerde serde = new TeradataBinarySerde();
+ private final Properties props = new Properties();
+
+ protected void setUp() throws Exception {
+ props.setProperty(serdeConstants.LIST_COLUMNS, "TD_TIMESTAMP");
+ props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "timestamp");
+ }
+
+ public void testTimestampPrecision6() throws Exception {
+ props.setProperty(TeradataBinarySerde.TD_TIMESTAMP_PRECISION, "6");
+ serde.initialize(null, props);
+
+ //2012-10-01 12:00:00.110000
+ BytesWritable in = new BytesWritable(
+ BaseEncoding.base16().lowerCase().decode("00323031322d31302d30312031323a30303a30302e313130303030"));
+
+ List<Object> row = (List<Object>) serde.deserialize(in);
+ Timestamp ts = ((TimestampWritableV2) row.get(0)).getTimestamp();
+ Assert.assertEquals(ts.getYear(), 2012);
+ Assert.assertEquals(ts.getMonth(), 10);
+ Assert.assertEquals(ts.getDay(), 1);
+ Assert.assertEquals(ts.getHours(), 12);
+ Assert.assertEquals(ts.getMinutes(), 0);
+ Assert.assertEquals(ts.getSeconds(), 0);
+ Assert.assertEquals(ts.getNanos(), 110000000);
+
+ BytesWritable res = (BytesWritable) serde.serialize(row, serde.getObjectInspector());
+ Assert.assertTrue(Arrays.equals(in.copyBytes(), res.copyBytes()));
+ }
+
+ public void testTimestampPrecision0() throws Exception {
+ props.setProperty(TeradataBinarySerde.TD_TIMESTAMP_PRECISION, "0");
+ serde.initialize(null, props);
+
+ //2012-10-01 12:00:00
+ BytesWritable in =
+ new BytesWritable(BaseEncoding.base16().lowerCase().decode("00323031322d31302d30312031323a30303a3030"));
+
+ List<Object> row = (List<Object>) serde.deserialize(in);
+ Timestamp ts = ((TimestampWritableV2) row.get(0)).getTimestamp();
+ Assert.assertEquals(ts.getYear(), 2012);
+ Assert.assertEquals(ts.getMonth(), 10);
+ Assert.assertEquals(ts.getDay(), 1);
+ Assert.assertEquals(ts.getHours(), 12);
+ Assert.assertEquals(ts.getMinutes(), 0);
+ Assert.assertEquals(ts.getSeconds(), 0);
+ Assert.assertEquals(ts.getNanos(), 0);
+
+ BytesWritable res = (BytesWritable) serde.serialize(row, serde.getObjectInspector());
+ Assert.assertTrue(Arrays.equals(in.copyBytes(), res.copyBytes()));
+ }
+
+ public void testTimestampPrecision3() throws Exception {
+ props.setProperty(TeradataBinarySerde.TD_TIMESTAMP_PRECISION, "3");
+ serde.initialize(null, props);
+
+ //2012-10-01 12:00:00.345
+ BytesWritable in =
+ new BytesWritable(BaseEncoding.base16().lowerCase().decode("00323031322d31302d30312031323a30303a30302e333435"));
+
+ List<Object> row = (List<Object>) serde.deserialize(in);
+ Timestamp ts = ((TimestampWritableV2) row.get(0)).getTimestamp();
+ Assert.assertEquals(ts.getYear(), 2012);
+ Assert.assertEquals(ts.getMonth(), 10);
+ Assert.assertEquals(ts.getDay(), 1);
+ Assert.assertEquals(ts.getHours(), 12);
+ Assert.assertEquals(ts.getMinutes(), 0);
+ Assert.assertEquals(ts.getSeconds(), 0);
+ Assert.assertEquals(ts.getNanos(), 345000000);
+
+ BytesWritable res = (BytesWritable) serde.serialize(row, serde.getObjectInspector());
+ Assert.assertTrue(Arrays.equals(in.copyBytes(), res.copyBytes()));
+ }
+}
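Unlike the integral types, TIMESTAMP is not a packed binary value: the fixtures above are the ASCII text of the timestamp, occupying 19 bytes for precision 0 and 19 + 1 + p bytes for fractional precision p, which is exactly what getTimeStampByteNum computes. A small standalone check (not part of the patch):

    import java.nio.charset.StandardCharsets;

    public class TeradataTimestampWidthCheck {
      // "yyyy-mm-dd hh:mm:ss" is 19 bytes; a fractional part adds '.'
      // plus one digit per unit of precision.
      static int byteNum(int precision) {
        return precision == 0 ? 19 : 19 + 1 + precision;
      }

      public static void main(String[] args) {
        byte[] payload = "2012-10-01 12:00:00.110000".getBytes(StandardCharsets.US_ASCII);
        System.out.println(payload.length); // 26
        System.out.println(byteNum(6));     // 26, matching the precision-6 fixture
        System.out.println(byteNum(0));     // 19, matching the precision-0 fixture
        System.out.println(byteNum(3));     // 23, matching the precision-3 fixture
      }
    }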
http://git-wip-us.apache.org/repos/asf/hive/blob/b8d82844/serde/src/test/org/apache/hadoop/hive/serde2/teradata/TestTeradataBinarySerdeGeneral.java
----------------------------------------------------------------------
diff --git a/serde/src/test/org/apache/hadoop/hive/serde2/teradata/TestTeradataBinarySerdeGeneral.java b/serde/src/test/org/apache/hadoop/hive/serde2/teradata/TestTeradataBinarySerdeGeneral.java
new file mode 100644
index 0000000..c50ef70
--- /dev/null
+++ b/serde/src/test/org/apache/hadoop/hive/serde2/teradata/TestTeradataBinarySerdeGeneral.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.serde2.teradata;
+
+import com.google.common.io.BaseEncoding;
+import junit.framework.TestCase;
+import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.hadoop.hive.serde2.io.ByteWritable;
+import org.apache.hadoop.hive.serde2.io.DateWritableV2;
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.hive.serde2.io.HiveCharWritable;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable;
+import org.apache.hadoop.hive.serde2.io.ShortWritable;
+import org.apache.hadoop.hive.serde2.io.TimestampWritableV2;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.junit.Assert;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.Properties;
+
+/**
+ * Test all the data types supported for Teradata Binary Format.
+ */
+public class TestTeradataBinarySerdeGeneral extends TestCase {
+
+ private final TeradataBinarySerde serde = new TeradataBinarySerde();
+ private final Properties props = new Properties();
+
+ protected void setUp() throws Exception {
+ props.setProperty(serdeConstants.LIST_COLUMNS,
+ "TD_CHAR, TD_VARCHAR, TD_BIGINT, TD_INT, TD_SMALLINT, TD_BYTEINT, "
+ + "TD_FLOAT,TD_DECIMAL,TD_DATE, TD_TIMESTAMP, TD_VARBYTE");
+ props.setProperty(serdeConstants.LIST_COLUMN_TYPES,
+ "char(3),varchar(100),bigint,int,smallint,tinyint,double,decimal(31,30),date,timestamp,binary");
+
+ serde.initialize(null, props);
+ }
+
+ public void testDeserializeAndSerialize() throws Exception {
+ BytesWritable in = new BytesWritable(BaseEncoding.base16().lowerCase().decode(
+ "00004e6f762020202020201b006120646179203d2031312f31312f31312020202020202020203435ec10000000000000c5feffff"
+ + "7707010000000000002a40ef2b3dab0d14e6531c8908a72700000007b20100313931312d31312d31312031393a32303a32312e34"
+ + "33333230301b00746573743a20202020202020343333322020202020202020333135"));
+
+ List<Object> row = (List<Object>) serde.deserialize(in);
+ Assert.assertEquals("Nov", ((HiveCharWritable) row.get(0)).toString());
+ Assert.assertEquals("a day = 11/11/11 45", ((HiveVarcharWritable) row.get(1)).toString());
+ Assert.assertEquals(4332L, ((LongWritable) row.get(2)).get());
+ Assert.assertEquals(-315, ((IntWritable) row.get(3)).get());
+ Assert.assertEquals((short) 1911, ((ShortWritable) row.get(4)).get());
+ Assert.assertEquals((byte) 1, ((ByteWritable) row.get(5)).get());
+ Assert.assertEquals((double) 13, ((DoubleWritable) row.get(6)).get(), 0);
+ Assert.assertEquals(30, ((HiveDecimalWritable) row.get(7)).getScale());
+ Assert.assertEquals((double) 3.141592653589793238462643383279,
+ ((HiveDecimalWritable) row.get(7)).getHiveDecimal().doubleValue(), 0);
+ Assert.assertEquals("1911-11-11", ((DateWritableV2) row.get(8)).toString());
+ Assert.assertEquals("1911-11-11 19:20:21.4332", ((TimestampWritableV2) row.get(9)).toString());
+ Assert.assertEquals(27, ((BytesWritable) row.get(10)).getLength());
+
+ BytesWritable res = (BytesWritable) serde.serialize(row, serde.getObjectInspector());
+ Assert.assertTrue(Arrays.equals(in.copyBytes(), res.copyBytes()));
+ }
+
+ public void testDeserializeAndSerializeWithNull() throws Exception {
+ //null bitmap: 0160 -> 00000001 01100000, so fields 7, 9 and 10 (0-indexed) are null
+ BytesWritable in = new BytesWritable(BaseEncoding.base16().lowerCase().decode(
+ "01604d61722020202020201b006120646179203d2031332f30332f303820202020202020202020397ca10000000000004300000"
+ + "0dd0700000000000048834000000000000000000000000000000000443f110020202020202020202020202020202020202020202"
+ + "020202020200000"));
+ List<Object> row = (List<Object>) serde.deserialize(in);
+
+ Assert.assertEquals("Mar", ((HiveCharWritable) row.get(0)).toString());
+ Assert.assertEquals(null, row.get(7));
+ Assert.assertEquals(null, row.get(9));
+ Assert.assertEquals(null, row.get(10));
+
+ BytesWritable res = (BytesWritable) serde.serialize(row, serde.getObjectInspector());
+ Assert.assertTrue(Arrays.equals(in.copyBytes(), res.copyBytes()));
+ }
+
+ public void testDeserializeAndSerializeAllNull() throws Exception {
+ BytesWritable in = new BytesWritable(BaseEncoding.base16().lowerCase().decode(
+ "ffe0202020202020202020000000000000000000000000000000000000000000000000000000000000000000000000000000000"
+ + "00000000020202020202020202020202020202020202020202020202020200000"));
+ List<Object> row = (List<Object>) serde.deserialize(in);
+
+ Assert.assertEquals(null, row.get(0));
+ Assert.assertEquals(null, row.get(1));
+ Assert.assertEquals(null, row.get(3));
+ Assert.assertEquals(null, row.get(4));
+ Assert.assertEquals(null, row.get(5));
+ Assert.assertEquals(null, row.get(6));
+ Assert.assertEquals(null, row.get(7));
+ Assert.assertEquals(null, row.get(8));
+ Assert.assertEquals(null, row.get(9));
+ Assert.assertEquals(null, row.get(10));
+
+ BytesWritable res = (BytesWritable) serde.serialize(row, serde.getObjectInspector());
+ Assert.assertTrue(Arrays.equals(in.copyBytes(), res.copyBytes()));
+ }
+
+ public void testDeserializeCorruptedRecord() throws Exception {
+ BytesWritable in = new BytesWritable(BaseEncoding.base16().lowerCase().decode(
+ "00004e6f762020202020201b006120646179203d2031312f31312f31312020202020202020203435ec10000000000000c5feff"
+ + "ff7707010000000000002a40ef2b3dab0d14e6531c8908a72700000007b20100313931312d31312d31312031393a32303a32312"
+ + "e3433333230301b00746573743a20202020202020343333322020202020202020333135ff"));
+
+ List<Object> row = (List<Object>) serde.deserialize(in);
+ Assert.assertEquals(null, row.get(0));
+ Assert.assertEquals(null, row.get(3));
+ Assert.assertEquals(null, row.get(10));
+ }
+}
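The VARCHAR field in the records above is framed as a 2-byte little-endian length prefix followed by the character bytes ("1b00" in the first fixture, i.e. a 27-byte payload). A sketch of that framing under those assumptions; the helper names are ours, not part of the patch:

    import java.io.ByteArrayOutputStream;
    import java.io.IOException;
    import java.nio.charset.StandardCharsets;

    public class VarcharFramingSketch {
      // 2-byte little-endian length prefix, then the character bytes.
      static byte[] frame(String s) throws IOException {
        byte[] chars = s.getBytes(StandardCharsets.US_ASCII);
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        out.write(chars.length & 0xff);        // low byte first (little endian)
        out.write((chars.length >> 8) & 0xff); // high byte
        out.write(chars);
        return out.toByteArray();
      }

      public static void main(String[] args) throws IOException {
        byte[] framed = frame("hello");
        System.out.printf("%02x %02x%n", framed[0], framed[1]); // 05 00
        // The first fixture above carries the prefix 1b 00, i.e. a 27-byte varchar payload.
      }
    }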
[2/2] hive git commit: HIVE-20225: SerDe to support Teradata Binary
Format (Lu Li via cws)
Posted by cw...@apache.org.
HIVE-20225: SerDe to support Teradata Binary Format (Lu Li via cws)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/b8d82844
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/b8d82844
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/b8d82844
Branch: refs/heads/master
Commit: b8d82844b9743d7a35dcc7fe6c702486fc4a9d72
Parents: cf5486d
Author: Carl Steinbach <cw...@apache.org>
Authored: Wed Aug 29 13:30:34 2018 -0700
Committer: Carl Steinbach <cw...@apache.org>
Committed: Wed Aug 29 13:50:56 2018 -0700
----------------------------------------------------------------------
.../ql/io/TeradataBinaryFileInputFormat.java | 66 ++
.../ql/io/TeradataBinaryFileOutputFormat.java | 112 ++++
.../hive/ql/io/TeradataBinaryRecordReader.java | 280 +++++++++
.../clientpositive/test_teradatabinaryfile.q | 123 ++++
.../test_teradatabinaryfile.q.out | 537 +++++++++++++++++
.../teradata/TeradataBinaryDataInputStream.java | 199 +++++++
.../TeradataBinaryDataOutputStream.java | 270 +++++++++
.../serde2/teradata/TeradataBinarySerde.java | 597 +++++++++++++++++++
.../TestTeradataBinarySerdeForDate.java | 76 +++
.../TestTeradataBinarySerdeForDecimal.java | 106 ++++
.../TestTeradataBinarySerdeForTimeStamp.java | 111 ++++
.../TestTeradataBinarySerdeGeneral.java | 133 +++++
12 files changed, 2610 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/b8d82844/ql/src/java/org/apache/hadoop/hive/ql/io/TeradataBinaryFileInputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/TeradataBinaryFileInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/TeradataBinaryFileInputFormat.java
new file mode 100644
index 0000000..bed87c5
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/TeradataBinaryFileInputFormat.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.io;
+
+import java.io.IOException;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileSplit;
+import org.apache.hadoop.mapred.InputSplit;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RecordReader;
+import org.apache.hadoop.mapred.Reporter;
+
+/**
+ * https://cwiki.apache.org/confluence/display/Hive/TeradataBinarySerde.
+ * FileInputFormat for Teradata binary files.
+ *
+ * In the Teradata Binary File, each record is constructed as follows:
+ * The first 2 bytes represent the length of the bytes that follow for this record.
+ * Next comes the null bitmap, whose length depends on the number of fields.
+ * Then each field of the record is serialized into bytes - the serialization strategy is decided by the type of field.
+ * Finally, there is one byte (0x0a) at the end of the record.
+ *
+ * This InputFormat currently doesn't support splitting the file.
+ * Teradata binary files use little-endian byte order.
+ */
+public class TeradataBinaryFileInputFormat extends FileInputFormat<NullWritable, BytesWritable> {
+
+ @Override public RecordReader<NullWritable, BytesWritable> getRecordReader(InputSplit split, JobConf job,
+ Reporter reporter) throws IOException {
+ reporter.setStatus(split.toString());
+ return new TeradataBinaryRecordReader(job, (FileSplit) split);
+ }
+
+ /**
+ * The <code>TeradataBinaryFileInputFormat</code> is not splittable right now, so
+ * the <code>isSplitable</code> function is overridden to return false.
+ *
+ * @param fs the file system that the file is on
+ * @param filename the file name to check
+ * @return whether this file is splitable (always false)
+ */
+ @Override protected boolean isSplitable(FileSystem fs, Path filename) {
+ return false;
+ }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/b8d82844/ql/src/java/org/apache/hadoop/hive/ql/io/TeradataBinaryFileOutputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/TeradataBinaryFileOutputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/TeradataBinaryFileOutputFormat.java
new file mode 100644
index 0000000..0469825
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/TeradataBinaryFileOutputFormat.java
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.io;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.Properties;
+
+import org.apache.commons.io.EndianUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
+import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.util.Progressable;
+
+import static java.lang.String.format;
+
+/**
+ * https://cwiki.apache.org/confluence/display/Hive/TeradataBinarySerde.
+ * FileOutputFormat for Teradata binary files.
+ *
+ * In the Teradata Binary File, each record is constructed as follows:
+ * The first 2 bytes represent the length of the bytes that follow for this record (null bitmap and fields).
+ * Next comes the null bitmap, whose length depends on the number of fields.
+ * Then each field of the record is serialized into bytes - the serialization strategy is decided by the type of field.
+ * Finally, there is one byte (0x0a) at the end of the record.
+ *
+ * Teradata binary files use little-endian byte order.
+ */
+public class TeradataBinaryFileOutputFormat<K extends WritableComparable, V extends Writable>
+ extends HiveIgnoreKeyTextOutputFormat<K, V> {
+ private static final Log LOG = LogFactory.getLog(TeradataBinaryFileOutputFormat.class);
+
+ static final byte RECORD_END_BYTE = (byte) 0x0a;
+
+ /**
+ * Create the final output file and write rows one by one. After each row is
+ * appended, the record-end byte is appended as the row separator.
+ *
+ * @param jc
+ * the job configuration file
+ * @param outPath
+ * the final output file to be created
+ * @param valueClass
+ * the value class used for creation
+ * @param isCompressed
+ * whether the content is compressed or not
+ * @param tableProperties
+ * the tableProperties of this file's corresponding table
+ * @param progress
+ * progress used for status report
+ * @return the RecordWriter
+ */
+ @Override public RecordWriter getHiveRecordWriter(JobConf jc, Path outPath, Class<? extends Writable> valueClass,
+ boolean isCompressed, Properties tableProperties, Progressable progress) throws IOException {
+ FileSystem fs = outPath.getFileSystem(jc);
+ final OutputStream outStream = Utilities.createCompressedStream(jc, fs.create(outPath, progress), isCompressed);
+ return new RecordWriter() {
+ @Override public void write(Writable r) throws IOException {
+ BytesWritable bw = (BytesWritable) r;
+ int recordLength = bw.getLength();
+
+ // Based on the configured row length, decide whether the length prefix is an int or a short
+ String rowLength = tableProperties
+ .getProperty(TeradataBinaryRecordReader.TD_ROW_LENGTH, TeradataBinaryRecordReader.DEFAULT_TD_ROW_LENGTH)
+ .toLowerCase();
+ LOG.debug(format("The table property %s is: %s", TeradataBinaryRecordReader.TD_ROW_LENGTH, rowLength));
+
+ if (TeradataBinaryRecordReader.TD_ROW_LENGTH_TO_BYTE_NUM.containsKey(rowLength)) {
+ if (rowLength.equals(TeradataBinaryRecordReader.DEFAULT_TD_ROW_LENGTH)) {
+ EndianUtils.writeSwappedShort(outStream, (short) recordLength); // write the length using little endian
+ } else if (rowLength.equals(TeradataBinaryRecordReader.TD_ROW_LENGTH_1MB)) {
+ EndianUtils.writeSwappedInteger(outStream, recordLength); // write the length using little endian
+ }
+ } else {
+ throw new IllegalArgumentException(format("%s doesn't support the value %s, the supported values are %s",
+ TeradataBinaryRecordReader.TD_ROW_LENGTH, rowLength,
+ TeradataBinaryRecordReader.TD_ROW_LENGTH_TO_BYTE_NUM.keySet()));
+ }
+
+ outStream.write(bw.getBytes(), 0, bw.getLength()); // write the content (the content is in little endian)
+ outStream.write(RECORD_END_BYTE); //write the record ending
+ }
+
+ @Override public void close(boolean abort) throws IOException {
+ outStream.close();
+ }
+ };
+ }
+}
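Putting the pieces together, one on-disk record is: length prefix (2 bytes for "64kb" rows, 4 bytes for "1mb" rows, little endian), then the null bitmap and field bytes, then the 0x0a terminator. A minimal sketch of the default 64kb framing, assuming commons-io is on the classpath (EndianUtils.writeSwappedShort is the same call used above); class and helper names are ours:

    import java.io.ByteArrayOutputStream;
    import java.io.IOException;

    import org.apache.commons.io.EndianUtils;

    public class RecordFramingSketch {
      static final byte RECORD_END_BYTE = (byte) 0x0a;

      // Frame one record payload (null bitmap + serialized fields) the way
      // TeradataBinaryFileOutputFormat does for the default "64kb" row length.
      static byte[] frame(byte[] payload) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        EndianUtils.writeSwappedShort(out, (short) payload.length); // little-endian length
        out.write(payload);
        out.write(RECORD_END_BYTE);
        return out.toByteArray();
      }

      public static void main(String[] args) throws IOException {
        byte[] framed = frame(new byte[] { 0x00, 0x01, 0x02 });
        for (byte b : framed) {
          System.out.printf("%02x ", b); // 03 00 00 01 02 0a
        }
      }
    }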
http://git-wip-us.apache.org/repos/asf/hive/blob/b8d82844/ql/src/java/org/apache/hadoop/hive/ql/io/TeradataBinaryRecordReader.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/TeradataBinaryRecordReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/TeradataBinaryRecordReader.java
new file mode 100644
index 0000000..337b5d2
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/TeradataBinaryRecordReader.java
@@ -0,0 +1,280 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.io;
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Map;
+
+import com.google.common.collect.ImmutableMap;
+import org.apache.commons.codec.binary.Hex;
+import org.apache.commons.io.EndianUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.Seekable;
+import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.plan.PartitionDesc;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.CompressionCodecFactory;
+import org.apache.hadoop.mapred.FileSplit;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RecordReader;
+
+import static java.lang.String.format;
+
+/**
+ * The TeradataBinaryRecordReader reads the record from Teradata binary files.
+ *
+ * In the Teradata Binary File, each record is constructed as follows:
+ * The first 2 bytes represent the length of the bytes that follow for this record.
+ * Next comes the null bitmap, whose length depends on the number of fields.
+ * Then each field of the record is serialized into bytes - the serialization strategy is decided by the type of field.
+ * Finally, there is one byte (0x0a) at the end of the record.
+ *
+ * This record reader currently doesn't support splitting the file.
+ * Teradata binary files use little-endian byte order.
+ */
+public class TeradataBinaryRecordReader implements RecordReader<NullWritable, BytesWritable> {
+
+ private static final Log LOG = LogFactory.getLog(TeradataBinaryRecordReader.class);
+
+ private CompressionCodecFactory compressionCodecs = null;
+ private InputStream in;
+ private long start;
+ private long pos;
+ private long end;
+ private final Seekable filePosition;
+ private CompressionCodec codec;
+
+ static final String TD_ROW_LENGTH = "teradata.row.length";
+ static final Map<String, Integer> TD_ROW_LENGTH_TO_BYTE_NUM = ImmutableMap.of("64kb", 2, "1mb", 4);
+ static final String DEFAULT_TD_ROW_LENGTH = "64kb";
+ static final String TD_ROW_LENGTH_1MB = "1mb";
+
+ private byte[] recordLengthBytes;
+ private byte[] valueByteArray = new byte[65536]; // buffer holding the content of the current record
+ private byte[] endOfRecord = new byte[1];
+
+ private int recordLength = 0;
+
+ public TeradataBinaryRecordReader(JobConf job, FileSplit fileSplit) throws IOException {
+ LOG.debug("initialize the TeradataBinaryRecordReader");
+
+ String rowLength = job.get(TD_ROW_LENGTH);
+ if (rowLength == null) {
+ LOG.debug("No table property in JobConf. Try to recover the table directly");
+ Map<String, PartitionDesc> partitionDescMap = Utilities.getMapRedWork(job).getMapWork().getAliasToPartnInfo();
+ for (String alias : Utilities.getMapRedWork(job).getMapWork().getAliasToPartnInfo().keySet()) {
+ LOG.debug(format("the current alias: %s", alias));
+ rowLength = partitionDescMap.get(alias).getTableDesc().getProperties().getProperty(TD_ROW_LENGTH);
+ if (rowLength != null) {
+ break;
+ }
+ }
+ }
+
+ if (rowLength == null) {
+ rowLength = DEFAULT_TD_ROW_LENGTH;
+ } else {
+ rowLength = rowLength.toLowerCase();
+ }
+
+ if (TD_ROW_LENGTH_TO_BYTE_NUM.containsKey(rowLength)) {
+ recordLengthBytes = new byte[TD_ROW_LENGTH_TO_BYTE_NUM.get(rowLength)];
+ } else {
+ throw new IllegalArgumentException(
+ format("%s doesn't support the value %s, the supported values are %s", TD_ROW_LENGTH, rowLength,
+ TD_ROW_LENGTH_TO_BYTE_NUM.keySet()));
+ }
+
+ start = fileSplit.getStart();
+ end = start + fileSplit.getLength();
+
+ LOG.debug(format("The start of the file split is: %s", start));
+ LOG.debug(format("The end of the file split is: %s", end));
+
+ final Path file = fileSplit.getPath();
+ compressionCodecs = new CompressionCodecFactory(job);
+ codec = compressionCodecs.getCodec(file);
+ FileSystem fs = file.getFileSystem(job);
+ FSDataInputStream fileIn = fs.open(fileSplit.getPath());
+
+ /* currently the Teradata binary file doesn't support file splits at all */
+ filePosition = fileIn;
+ if (isCompressedInput()) {
+ LOG.info(format("Input file is compressed. Using compression code %s", codec.getClass().getName()));
+ in = codec.createInputStream(fileIn);
+ } else {
+ LOG.info("The input file is not compressed");
+ in = fileIn;
+ }
+ pos = start;
+ }
+
+ /**
+ * Reads the next key/value pair from the input for processing.
+ *
+ * @param key the key to read data into
+ * @param value the value to read data into
+ * @return true iff a key/value was read, false if at EOF
+ */
+ @Override public synchronized boolean next(NullWritable key, BytesWritable value) throws IOException {
+
+ /* read the record length */
+ int lengthExpected = recordLengthBytes.length;
+ int hasConsumed = readExpectedBytes(recordLengthBytes, lengthExpected);
+ if (hasConsumed == 0) {
+ LOG.info("Reach the End of File. No more record");
+ return false;
+ } else if (hasConsumed < lengthExpected) {
+ LOG.error(
+ format("We expect %s bytes for the record length but read %d byte and reach the End of File.", lengthExpected,
+ hasConsumed));
+ LOG.error(format("The current position in the file : %s", getFilePosition()));
+ LOG.error(format("The current consumed bytes: %s", pos));
+ LOG.error(format("The bytes for the current record is: %s", Hex.encodeHexString(recordLengthBytes)));
+ throw new EOFException("When reading the record length, reach the unexpected end of file.");
+ }
+ /* get the record content length to prepare to read the content */
+ recordLength = EndianUtils.readSwappedUnsignedShort(recordLengthBytes, 0);
+ pos += lengthExpected;
+
+ /* read the record content */
+ lengthExpected = recordLength;
+ hasConsumed = readExpectedBytes(valueByteArray, lengthExpected);
+ if (hasConsumed < lengthExpected) {
+ LOG.error(format("We expect %s bytes for the record content but read %d byte and reach the End of File.",
+ lengthExpected, hasConsumed));
+ LOG.error(format("The current position in the file : %s", getFilePosition()));
+ LOG.error(format("The current consumed bytes: %s", pos));
+ LOG.error(format("The bytes for the current record is: %s",
+ Hex.encodeHexString(recordLengthBytes) + Hex.encodeHexString(valueByteArray)));
+ throw new EOFException("When reading the contend of the record, reach the unexpected end of file.");
+ }
+ value.set(valueByteArray, 0, recordLength);
+ pos += lengthExpected;
+
+ /* read the record end */
+ lengthExpected = endOfRecord.length;
+ hasConsumed = readExpectedBytes(endOfRecord, lengthExpected);
+ if (hasConsumed < lengthExpected) {
+ LOG.error(format("We expect %s bytes for the record end symbol but read %d byte and reach the End of File.",
+ lengthExpected, hasConsumed));
+ LOG.error(format("The current position in the file : %s", getFilePosition()));
+ LOG.error(format("The current consumed bytes: %s", pos));
+ LOG.error(format("The bytes for the current record is: %s",
+ Hex.encodeHexString(recordLengthBytes) + Hex.encodeHexString(valueByteArray) + Hex
+ .encodeHexString(endOfRecord)));
+ throw new EOFException("When reading the end of record, reach the unexpected end of file.");
+ }
+
+ if (endOfRecord[0] != TeradataBinaryFileOutputFormat.RECORD_END_BYTE) {
+ throw new IOException(format("Expected 0x0a as the record end but got %s.", Hex.encodeHexString(endOfRecord)));
+ }
+ pos += lengthExpected;
+
+ return true;
+ }
+
+ /**
+ * Create an object of the appropriate type to be used as a key.
+ *
+ * @return a new key object.
+ */
+ @Override public NullWritable createKey() {
+ return NullWritable.get();
+ }
+
+ /**
+ * Create an object of the appropriate type to be used as a value.
+ *
+ * @return a new value object.
+ */
+ @Override public BytesWritable createValue() {
+ return new BytesWritable();
+ }
+
+ /**
+ * Returns the current position in the input.
+ *
+ * @return the current position in the input.
+ * @throws IOException
+ */
+ @Override public long getPos() throws IOException {
+ return pos;
+ }
+
+ /**
+ * Closes the underlying input stream.
+ * @throws IOException
+ */
+ @Override public void close() throws IOException {
+ if (in != null) {
+ in.close();
+ }
+ }
+
+ /**
+ * How much of the input has the {@link RecordReader} consumed,
+ * i.e. been processed?
+ *
+ * @return progress from <code>0.0</code> to <code>1.0</code>.
+ * @throws IOException
+ */
+ @Override public float getProgress() throws IOException {
+ if (start == end) {
+ return 0.0F;
+ } else {
+ return Math.min(1.0F, (float) (getFilePosition() - start) / (float) (end - start));
+ }
+ }
+
+ private boolean isCompressedInput() {
+ return codec != null;
+ }
+
+ private synchronized long getFilePosition() throws IOException {
+ long retVal;
+ if (isCompressedInput() && filePosition != null) {
+ retVal = filePosition.getPos();
+ } else {
+ retVal = getPos();
+ }
+ return retVal;
+ }
+
+ private synchronized int readExpectedBytes(byte[] toWrite, int lengthExpected) throws IOException {
+ int curPos = 0;
+ do {
+ int numOfByteRead = in.read(toWrite, curPos, lengthExpected - curPos);
+ if (numOfByteRead < 0) {
+ return curPos;
+ } else {
+ curPos += numOfByteRead;
+ }
+ } while (curPos < lengthExpected);
+ return curPos;
+ }
+}
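
The per-record loop in next() above can be summarized by this minimal sketch. It is
not part of this patch and assumes the default 64KB layout (2-byte length prefix)
over an uncompressed stream; the real reader also tracks the byte position and
treats EOF before the length prefix as the normal end of input:

    import java.io.DataInputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import org.apache.commons.io.EndianUtils;

    public class RecordReadSketch {
      static byte[] readRecord(InputStream in) throws IOException {
        DataInputStream din = new DataInputStream(in);
        byte[] lengthBytes = new byte[2];
        din.readFully(lengthBytes);                      // 2-byte little-endian record length
        int recordLength = EndianUtils.readSwappedUnsignedShort(lengthBytes, 0);
        byte[] payload = new byte[recordLength];
        din.readFully(payload);                          // null bitmap + serialized fields
        if (din.read() != 0x0a) {                        // verify the record terminator
          throw new IOException("expected 0x0a record terminator");
        }
        return payload;
      }
    }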
http://git-wip-us.apache.org/repos/asf/hive/blob/b8d82844/ql/src/test/queries/clientpositive/test_teradatabinaryfile.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/test_teradatabinaryfile.q b/ql/src/test/queries/clientpositive/test_teradatabinaryfile.q
new file mode 100644
index 0000000..33ab677
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/test_teradatabinaryfile.q
@@ -0,0 +1,123 @@
+DROP TABLE if exists teradata_binary_table_64kb;
+DROP TABLE if exists teradata_binary_table_1mb;
+DROP TABLE if exists teradata_binary_table_64kb_insert;
+DROP TABLE if exists teradata_binary_table_1mb_insert;
+
+
+CREATE TABLE `teradata_binary_table_64kb`(
+ `test_tinyint` tinyint,
+ `test_smallint` smallint,
+ `test_int` int,
+ `test_bigint` bigint,
+ `test_double` double,
+ `test_decimal` decimal(15,2),
+ `test_date` date,
+ `test_timestamp` timestamp,
+ `test_char` char(1),
+ `test_varchar` varchar(40),
+ `test_binary` binary
+ )
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde'
+STORED AS INPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat'
+OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat'
+TBLPROPERTIES (
+ 'teradata.timestamp.precision'='0',
+ 'teradata.char.charset'='LATIN',
+ 'teradata.row.length'='64KB'
+);
+
+CREATE TABLE `teradata_binary_table_1mb`(
+ `test_tinyint` tinyint,
+ `test_smallint` smallint,
+ `test_int` int,
+ `test_bigint` bigint,
+ `test_double` double,
+ `test_decimal` decimal(15,2),
+ `test_date` date,
+ `test_timestamp` timestamp,
+ `test_char` char(1),
+ `test_varchar` varchar(40),
+ `test_binary` binary
+ )
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde'
+STORED AS INPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat'
+OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat'
+TBLPROPERTIES (
+ 'teradata.timestamp.precision'='6',
+ 'teradata.char.charset'='UNICODE',
+ 'teradata.row.length'='1MB'
+);
+
+CREATE TABLE `teradata_binary_table_64kb_insert`(
+ `test_tinyint` tinyint,
+ `test_decimal` decimal(15,2),
+ `test_date` date,
+ `test_timestamp` timestamp
+ )
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde'
+STORED AS INPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat'
+OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat'
+TBLPROPERTIES (
+ 'teradata.timestamp.precision'='0',
+ 'teradata.char.charset'='LATIN',
+ 'teradata.row.length'='64KB'
+);
+
+CREATE TABLE `teradata_binary_table_1mb_insert`(
+ `test_tinyint` tinyint,
+ `test_int` int
+ )
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde'
+STORED AS INPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat'
+OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat'
+TBLPROPERTIES (
+ 'teradata.timestamp.precision'='6',
+ 'teradata.char.charset'='UNICODE',
+ 'teradata.row.length'='1MB'
+);
+
+LOAD DATA LOCAL INPATH '../../data/files/teradata_binary_file/teradata_binary_table.deflate' OVERWRITE INTO TABLE teradata_binary_table_64kb;
+LOAD DATA LOCAL INPATH '../../data/files/teradata_binary_file/td_data_with_1mb_rowsize.teradata.gz' OVERWRITE INTO TABLE teradata_binary_table_1mb;
+
+SELECT * from teradata_binary_table_64kb;
+SELECT * from teradata_binary_table_1mb;
+
+SELECT COUNT(*) FROM teradata_binary_table_64kb;
+SELECT COUNT(*) FROM teradata_binary_table_1mb;
+
+SELECT max(date_format(test_timestamp, 'y')) FROM teradata_binary_table_64kb;
+SELECT max(date_format(test_date, 'y')) FROM teradata_binary_table_64kb;
+SELECT max(Floor(test_decimal)) FROM teradata_binary_table_64kb;
+
+SELECT max(date_format(test_timestamp, 'y')) FROM teradata_binary_table_1mb;
+SELECT max(date_format(test_date, 'y')) FROM teradata_binary_table_1mb;
+SELECT max(Floor(test_decimal)) FROM teradata_binary_table_1mb;
+
+SELECT test_tinyint, MAX(test_decimal) FROM teradata_binary_table_64kb GROUP BY test_tinyint;
+SELECT test_tinyint, MAX(test_decimal) FROM teradata_binary_table_1mb GROUP BY test_tinyint;
+
+INSERT OVERWRITE TABLE teradata_binary_table_64kb_insert
+SELECT test_tinyint, test_decimal, test_date, test_timestamp FROM teradata_binary_table_64kb;
+
+INSERT OVERWRITE TABLE teradata_binary_table_1mb_insert
+SELECT 1, 15;
+
+DESC FORMATTED teradata_binary_table_64kb_insert;
+DESC FORMATTED teradata_binary_table_1mb_insert;
+
+DROP TABLE if exists teradata_binary_table_64kb;
+DROP TABLE if exists teradata_binary_table_1mb;
+DROP TABLE if exists teradata_binary_table_64kb_insert;
+DROP TABLE if exists teradata_binary_table_1mb_insert;
http://git-wip-us.apache.org/repos/asf/hive/blob/b8d82844/ql/src/test/results/clientpositive/test_teradatabinaryfile.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/test_teradatabinaryfile.q.out b/ql/src/test/results/clientpositive/test_teradatabinaryfile.q.out
new file mode 100644
index 0000000..9db1372
--- /dev/null
+++ b/ql/src/test/results/clientpositive/test_teradatabinaryfile.q.out
@@ -0,0 +1,537 @@
+PREHOOK: query: DROP TABLE if exists teradata_binary_table_64kb
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE if exists teradata_binary_table_64kb
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: DROP TABLE if exists teradata_binary_table_1mb
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE if exists teradata_binary_table_1mb
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: DROP TABLE if exists teradata_binary_table_64kb_insert
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE if exists teradata_binary_table_64kb_insert
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: DROP TABLE if exists teradata_binary_table_1mb_insert
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE if exists teradata_binary_table_1mb_insert
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE `teradata_binary_table_64kb`(
+ `test_tinyint` tinyint,
+ `test_smallint` smallint,
+ `test_int` int,
+ `test_bigint` bigint,
+ `test_double` double,
+ `test_decimal` decimal(15,2),
+ `test_date` date,
+ `test_timestamp` timestamp,
+ `test_char` char(1),
+ `test_varchar` varchar(40),
+ `test_binary` binary
+ )
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde'
+STORED AS INPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat'
+OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat'
+TBLPROPERTIES (
+ 'teradata.timestamp.precision'='0',
+ 'teradata.char.charset'='LATIN',
+ 'teradata.row.length'='64KB'
+)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@teradata_binary_table_64kb
+POSTHOOK: query: CREATE TABLE `teradata_binary_table_64kb`(
+ `test_tinyint` tinyint,
+ `test_smallint` smallint,
+ `test_int` int,
+ `test_bigint` bigint,
+ `test_double` double,
+ `test_decimal` decimal(15,2),
+ `test_date` date,
+ `test_timestamp` timestamp,
+ `test_char` char(1),
+ `test_varchar` varchar(40),
+ `test_binary` binary
+ )
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde'
+STORED AS INPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat'
+OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat'
+TBLPROPERTIES (
+ 'teradata.timestamp.precision'='0',
+ 'teradata.char.charset'='LATIN',
+ 'teradata.row.length'='64KB'
+)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@teradata_binary_table_64kb
+PREHOOK: query: CREATE TABLE `teradata_binary_table_1mb`(
+ `test_tinyint` tinyint,
+ `test_smallint` smallint,
+ `test_int` int,
+ `test_bigint` bigint,
+ `test_double` double,
+ `test_decimal` decimal(15,2),
+ `test_date` date,
+ `test_timestamp` timestamp,
+ `test_char` char(1),
+ `test_varchar` varchar(40),
+ `test_binary` binary
+ )
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde'
+STORED AS INPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat'
+OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat'
+TBLPROPERTIES (
+ 'teradata.timestamp.precision'='6',
+ 'teradata.char.charset'='UNICODE',
+ 'teradata.row.length'='1MB'
+)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@teradata_binary_table_1mb
+POSTHOOK: query: CREATE TABLE `teradata_binary_table_1mb`(
+ `test_tinyint` tinyint,
+ `test_smallint` smallint,
+ `test_int` int,
+ `test_bigint` bigint,
+ `test_double` double,
+ `test_decimal` decimal(15,2),
+ `test_date` date,
+ `test_timestamp` timestamp,
+ `test_char` char(1),
+ `test_varchar` varchar(40),
+ `test_binary` binary
+ )
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde'
+STORED AS INPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat'
+OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat'
+TBLPROPERTIES (
+ 'teradata.timestamp.precision'='6',
+ 'teradata.char.charset'='UNICODE',
+ 'teradata.row.length'='1MB'
+)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@teradata_binary_table_1mb
+PREHOOK: query: CREATE TABLE `teradata_binary_table_64kb_insert`(
+ `test_tinyint` tinyint,
+ `test_decimal` decimal(15,2),
+ `test_date` date,
+ `test_timestamp` timestamp
+ )
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde'
+STORED AS INPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat'
+OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat'
+TBLPROPERTIES (
+ 'teradata.timestamp.precision'='0',
+ 'teradata.char.charset'='LATIN',
+ 'teradata.row.length'='64KB'
+)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@teradata_binary_table_64kb_insert
+POSTHOOK: query: CREATE TABLE `teradata_binary_table_64kb_insert`(
+ `test_tinyint` tinyint,
+ `test_decimal` decimal(15,2),
+ `test_date` date,
+ `test_timestamp` timestamp
+ )
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde'
+STORED AS INPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat'
+OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat'
+TBLPROPERTIES (
+ 'teradata.timestamp.precision'='0',
+ 'teradata.char.charset'='LATIN',
+ 'teradata.row.length'='64KB'
+)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@teradata_binary_table_64kb_insert
+PREHOOK: query: CREATE TABLE `teradata_binary_table_1mb_insert`(
+ `test_tinyint` tinyint,
+ `test_int` int
+ )
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde'
+STORED AS INPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat'
+OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat'
+TBLPROPERTIES (
+ 'teradata.timestamp.precision'='6',
+ 'teradata.char.charset'='UNICODE',
+ 'teradata.row.length'='1MB'
+)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@teradata_binary_table_1mb_insert
+POSTHOOK: query: CREATE TABLE `teradata_binary_table_1mb_insert`(
+ `test_tinyint` tinyint,
+ `test_int` int
+ )
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde'
+STORED AS INPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat'
+OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat'
+TBLPROPERTIES (
+ 'teradata.timestamp.precision'='6',
+ 'teradata.char.charset'='UNICODE',
+ 'teradata.row.length'='1MB'
+)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@teradata_binary_table_1mb_insert
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/teradata_binary_file/teradata_binary_table.deflate' OVERWRITE INTO TABLE teradata_binary_table_64kb
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@teradata_binary_table_64kb
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/teradata_binary_file/teradata_binary_table.deflate' OVERWRITE INTO TABLE teradata_binary_table_64kb
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@teradata_binary_table_64kb
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/teradata_binary_file/td_data_with_1mb_rowsize.teradata.gz' OVERWRITE INTO TABLE teradata_binary_table_1mb
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@teradata_binary_table_1mb
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/teradata_binary_file/td_data_with_1mb_rowsize.teradata.gz' OVERWRITE INTO TABLE teradata_binary_table_1mb
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@teradata_binary_table_1mb
+PREHOOK: query: SELECT * from teradata_binary_table_64kb
+PREHOOK: type: QUERY
+PREHOOK: Input: default@teradata_binary_table_64kb
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * from teradata_binary_table_64kb
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@teradata_binary_table_64kb
+#### A masked pattern was here ####
+10 34 139997714 32307660 18.6717 59.99 2018-08-23 2018-07-23 01:45:55 A NULL NULL
+10 28 89082024 53367308 5.9069 27.90 2018-08-23 2018-07-23 19:45:36 A NULL NULL
+10 31 65499801 9495835 5.9064 29.99 2018-08-23 2018-07-23 09:15:10 A NULL NULL
+10 20 144923884 123337561 20.1037 50.50 2018-08-23 2018-07-23 22:49:52 A NULL NULL
+10 9 118474716 110462827 18.6697 29.99 2018-08-23 2018-07-23 10:13:03 A NULL NULL
+10 4 116098596 555556155 20.1017 29.99 2018-07-23 2018-07-23 13:12:10 X SELF_SERVICE SELF_SERVICE
+10 10 84492975 100052093 15.4913 29.99 2018-08-23 2018-07-23 17:56:32 A NULL NULL
+10 31 101314613 45413087 5.9064 29.99 2018-08-23 2018-07-23 11:26:24 A NULL NULL
+10 1 156962113 554297748 NULL 29.99 2018-08-23 2018-07-23 11:31:31 A NULL NULL
+10 10 92560875 380929783 20.1011 20.91 2018-07-30 2018-07-23 05:02:42 S RCHARGE_FAILURE RCHARGE_FAILURE
+10 5 154490193 186062438 20.1037 29.99 2018-07-23 2018-07-23 10:17:20 X NULL NULL
+10 31 2954435 34009387 0.0214 24.23 2018-08-23 2018-07-23 15:46:21 A NULL NULL
+10 4 156942563 55362740 0.0024 29.99 2018-08-23 2018-07-23 08:16:49 A NULL NULL
+10 31 90527523 126581551 7.5689 59.99 2018-08-23 2018-07-23 03:40:28 A NULL NULL
+10 1 118477496 598803186 NULL 29.99 2018-08-23 2018-07-23 10:45:28 A NULL NULL
+10 75 137653654 38440942 20.1037 29.99 2018-08-23 2018-07-23 19:01:04 A NULL NULL
+10 2 142697304 106829658 20.1008 24.21 2018-07-23 2018-07-23 05:22:17 S RCHARGE_FAILURE RCHARGE_FAILURE
+10 14 134043823 264156349 20.1008 24.21 2018-08-23 2018-07-23 12:12:48 A NULL NULL
+10 7 91359485 7008957 20.1011 20.91 2018-08-23 2018-07-23 23:42:04 A NULL NULL
+10 1 118512426 222159750 NULL 29.99 2018-08-23 2018-07-23 17:06:25 A NULL NULL
+10 5 155168873 135968937 18.6697 59.99 2018-07-30 2018-07-23 18:01:35 S RCHARGE_FAILURE RCHARGE_FAILURE
+10 4 151084943 38355275 20.1017 29.99 2018-08-23 2018-07-23 04:12:32 A NULL NULL
+10 6 118452556 90264779 20.1017 59.99 2018-08-23 2018-07-23 05:18:44 A NULL NULL
+10 31 53127101 18622653 0.0115 49.95 2018-08-23 2018-07-23 07:38:05 A NULL NULL
+10 1 118479736 216825119 NULL 29.99 2018-08-23 2018-07-23 11:11:51 A NULL NULL
+10 4 142708764 21984202 30.5785 27.50 2018-08-23 2018-07-23 10:36:22 A NULL NULL
+10 4 142713364 33598449 20.1017 29.99 2018-07-23 2018-07-23 12:49:24 X SELF_SERVICE SELF_SERVICE
+10 22 103578546 152144452 20.1017 29.99 2018-08-23 2018-07-23 11:18:44 A NULL NULL
+10 22 111233194 69051 20.1017 29.99 2018-08-23 2018-07-23 08:58:16 A NULL NULL
+10 12 132376034 2651098 20.1017 29.99 2018-08-23 2018-07-23 06:01:44 A NULL NULL
+10 11 135778714 29866847 18.6717 59.99 2018-08-23 2018-07-23 02:35:58 A NULL NULL
+10 10 118525066 34556421 5.9064 29.99 2018-08-23 2018-07-23 21:15:29 A NULL NULL
+10 7 144897784 532208226 20.1017 29.99 2018-08-23 2018-07-23 14:35:42 A NULL NULL
+10 34 87091713 93626084 5.9064 29.99 2018-08-23 2018-07-23 08:56:25 A NULL NULL
+10 21 129323704 14298869 30.5516 55.03 2018-08-23 2018-07-23 05:48:14 A NULL NULL
+10 31 112813163 36762074 5.9064 29.99 2018-08-23 2018-07-23 18:07:23 A NULL NULL
+10 1 156980833 58308375 NULL 59.99 2018-08-23 2018-07-23 14:54:17 A NULL NULL
+10 5 150357953 101207194 20.1017 29.99 2018-08-14 2018-07-23 13:53:14 S NULL NULL
+10 1 118462836 668498576 NULL 55.03 2018-08-23 2018-07-23 07:44:11 A NULL NULL
+10 7 129423664 312394041 20.1017 29.99 2018-08-23 2018-07-23 20:40:42 A NULL NULL
+10 10 122518074 5448199 20.1017 29.99 2018-08-23 2018-07-23 01:30:03 A NULL NULL
+10 3 113469566 593079639 20.1037 29.99 2018-08-23 2018-07-23 19:39:05 A NULL NULL
+10 4 144878314 88960410 18.6689 55.03 2018-08-23 2018-07-23 11:43:56 A NULL NULL
+10 8 146831593 168164335 30.5786 28.03 2018-08-23 2018-07-23 11:34:36 A NULL NULL
+10 4 91358385 23752815 29.9896 27.21 2018-08-23 2018-07-23 23:20:30 A NULL NULL
+10 3 118533306 286487393 30.5529 44.02 2019-07-23 2018-07-23 23:48:14 A NULL NULL
+10 7 103618686 339052539 18.6697 59.99 2018-08-23 2018-07-23 18:26:54 A NULL NULL
+10 11 92556375 196464425 29.9896 27.21 2018-08-23 2018-07-23 03:15:07 A NULL NULL
+10 11 137563254 239883707 18.6697 59.99 2018-08-23 2018-07-23 02:01:31 A NULL NULL
+10 2 116078336 61997052 20.1017 29.99 2018-07-23 2018-07-23 00:55:05 X SELF_SERVICE SELF_SERVICE
+PREHOOK: query: SELECT * from teradata_binary_table_1mb
+PREHOOK: type: QUERY
+PREHOOK: Input: default@teradata_binary_table_1mb
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * from teradata_binary_table_1mb
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@teradata_binary_table_1mb
+#### A masked pattern was here ####
+-6 0 -99999 -1 NULL 0.00 2011-01-02 2009-02-28 12:34:56 数 AABBCC
+5 3200 -9999 NULL 3.14159 314000000.00 NULL 2011-02-28 12:34:56 ABC NULL
+-127 32000 -9 1234567890123456789 2.01E10 3.14 2011-01-02 2022-02-28 12:34:56 数 ありがとうございます �7��c�
+-1 -32000 0 123456789012345678 2.0108E10 314.15 0001-12-31 NULL A thank you �7��c�
+127 32767 1 999000 2.034E12 0.04 2099-01-02 NULL I �7��c�
+2 -32767 9 987654321098765432 2.019876E12 NULL 2011-01-02 NULL あ test NULL
+3 32 99 -1234567890123456789 2.0E12 3140000000000.00 2999-12-31 0001-12-28 12:34:56 ? *** �7��c�
+-127 32000 100 1234567890123456789 2.01E10 3.14 2011-01-02 2022-02-28 12:34:56 数 ありがとうございます �7��c�
+-1 -32000 101 123456789012345678 2.0108E10 314.15 2009-09-09 NULL A thank you �7��c�
+127 32767 102 999000 2.034E12 0.04 2011-01-02 NULL I �7��c�
+2 -32767 103 987654321098765432 2.019876E12 NULL 2011-01-02 NULL あ test NULL
+3 32 104 -1234567890123456789 2.01E10 3.14 2011-01-02 0001-12-28 12:34:56 ? * �7��c�
+-4 320 105 0 2.01E10 3.14 2011-01-02 2010-02-28 12:34:56 NULL ||ありがとうございます|| �7��c�
+5 3200 106 NULL 3.14159 3.14 2011-01-02 2011-02-28 12:34:56 ABC NULL
+-6 0 107 -1 NULL 0.00 2011-01-02 2009-02-28 12:34:56 数 AABBCC
+7 NULL 108 65536 2.01E-8 NULL NULL 2099-02-28 12:34:56 数 NULL �7��c�
+NULL 1 109 256 1.01E18 12.00 2011-01-02 2999-12-31 12:34:56 数 NULL �7��c�
+-4 320 999 0 2.01E10 3.14 2011-01-02 2010-02-28 12:34:56 NULL ||ありがとうございます|| �7��c�
+NULL 1 1234 256 1.01E18 12.00 2000-01-02 2999-12-31 12:34:56 数 NULL �7��c�
+7 NULL 999999 65536 2.01E-8 NULL NULL 2099-02-28 12:34:56 数 NULL �7��c�
+PREHOOK: query: SELECT COUNT(*) FROM teradata_binary_table_64kb
+PREHOOK: type: QUERY
+PREHOOK: Input: default@teradata_binary_table_64kb
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT COUNT(*) FROM teradata_binary_table_64kb
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@teradata_binary_table_64kb
+#### A masked pattern was here ####
+50
+PREHOOK: query: SELECT COUNT(*) FROM teradata_binary_table_1mb
+PREHOOK: type: QUERY
+PREHOOK: Input: default@teradata_binary_table_1mb
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT COUNT(*) FROM teradata_binary_table_1mb
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@teradata_binary_table_1mb
+#### A masked pattern was here ####
+20
+PREHOOK: query: SELECT max(date_format(test_timestamp, 'y')) FROM teradata_binary_table_64kb
+PREHOOK: type: QUERY
+PREHOOK: Input: default@teradata_binary_table_64kb
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT max(date_format(test_timestamp, 'y')) FROM teradata_binary_table_64kb
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@teradata_binary_table_64kb
+#### A masked pattern was here ####
+2018
+PREHOOK: query: SELECT max(date_format(test_date, 'y')) FROM teradata_binary_table_64kb
+PREHOOK: type: QUERY
+PREHOOK: Input: default@teradata_binary_table_64kb
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT max(date_format(test_date, 'y')) FROM teradata_binary_table_64kb
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@teradata_binary_table_64kb
+#### A masked pattern was here ####
+2019
+PREHOOK: query: SELECT max(Floor(test_decimal)) FROM teradata_binary_table_64kb
+PREHOOK: type: QUERY
+PREHOOK: Input: default@teradata_binary_table_64kb
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT max(Floor(test_decimal)) FROM teradata_binary_table_64kb
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@teradata_binary_table_64kb
+#### A masked pattern was here ####
+59
+PREHOOK: query: SELECT max(date_format(test_timestamp, 'y')) FROM teradata_binary_table_1mb
+PREHOOK: type: QUERY
+PREHOOK: Input: default@teradata_binary_table_1mb
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT max(date_format(test_timestamp, 'y')) FROM teradata_binary_table_1mb
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@teradata_binary_table_1mb
+#### A masked pattern was here ####
+2999
+PREHOOK: query: SELECT max(date_format(test_date, 'y')) FROM teradata_binary_table_1mb
+PREHOOK: type: QUERY
+PREHOOK: Input: default@teradata_binary_table_1mb
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT max(date_format(test_date, 'y')) FROM teradata_binary_table_1mb
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@teradata_binary_table_1mb
+#### A masked pattern was here ####
+2999
+PREHOOK: query: SELECT max(Floor(test_decimal)) FROM teradata_binary_table_1mb
+PREHOOK: type: QUERY
+PREHOOK: Input: default@teradata_binary_table_1mb
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT max(Floor(test_decimal)) FROM teradata_binary_table_1mb
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@teradata_binary_table_1mb
+#### A masked pattern was here ####
+3140000000000
+PREHOOK: query: SELECT test_tinyint, MAX(test_decimal) FROM teradata_binary_table_64kb GROUP BY test_tinyint
+PREHOOK: type: QUERY
+PREHOOK: Input: default@teradata_binary_table_64kb
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT test_tinyint, MAX(test_decimal) FROM teradata_binary_table_64kb GROUP BY test_tinyint
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@teradata_binary_table_64kb
+#### A masked pattern was here ####
+10 59.99
+PREHOOK: query: SELECT test_tinyint, MAX(test_decimal) FROM teradata_binary_table_1mb GROUP BY test_tinyint
+PREHOOK: type: QUERY
+PREHOOK: Input: default@teradata_binary_table_1mb
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT test_tinyint, MAX(test_decimal) FROM teradata_binary_table_1mb GROUP BY test_tinyint
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@teradata_binary_table_1mb
+#### A masked pattern was here ####
+NULL 12.00
+-127 3.14
+-6 0.00
+-4 3.14
+-1 314.15
+2 NULL
+3 3140000000000.00
+5 314000000.00
+7 NULL
+127 0.04
+PREHOOK: query: INSERT OVERWRITE TABLE teradata_binary_table_64kb_insert
+SELECT test_tinyint, test_decimal, test_date, test_timestamp FROM teradata_binary_table_64kb
+PREHOOK: type: QUERY
+PREHOOK: Input: default@teradata_binary_table_64kb
+PREHOOK: Output: default@teradata_binary_table_64kb_insert
+POSTHOOK: query: INSERT OVERWRITE TABLE teradata_binary_table_64kb_insert
+SELECT test_tinyint, test_decimal, test_date, test_timestamp FROM teradata_binary_table_64kb
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@teradata_binary_table_64kb
+POSTHOOK: Output: default@teradata_binary_table_64kb_insert
+POSTHOOK: Lineage: teradata_binary_table_64kb_insert.test_date SIMPLE [(teradata_binary_table_64kb)teradata_binary_table_64kb.FieldSchema(name:test_date, type:date, comment:from deserializer), ]
+POSTHOOK: Lineage: teradata_binary_table_64kb_insert.test_decimal SIMPLE [(teradata_binary_table_64kb)teradata_binary_table_64kb.FieldSchema(name:test_decimal, type:decimal(15,2), comment:from deserializer), ]
+POSTHOOK: Lineage: teradata_binary_table_64kb_insert.test_timestamp SIMPLE [(teradata_binary_table_64kb)teradata_binary_table_64kb.FieldSchema(name:test_timestamp, type:timestamp, comment:from deserializer), ]
+POSTHOOK: Lineage: teradata_binary_table_64kb_insert.test_tinyint SIMPLE [(teradata_binary_table_64kb)teradata_binary_table_64kb.FieldSchema(name:test_tinyint, type:tinyint, comment:from deserializer), ]
+PREHOOK: query: INSERT OVERWRITE TABLE teradata_binary_table_1mb_insert
+SELECT 1, 15
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@teradata_binary_table_1mb_insert
+POSTHOOK: query: INSERT OVERWRITE TABLE teradata_binary_table_1mb_insert
+SELECT 1, 15
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@teradata_binary_table_1mb_insert
+POSTHOOK: Lineage: teradata_binary_table_1mb_insert.test_int SIMPLE []
+POSTHOOK: Lineage: teradata_binary_table_1mb_insert.test_tinyint EXPRESSION []
+PREHOOK: query: DESC FORMATTED teradata_binary_table_64kb_insert
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@teradata_binary_table_64kb_insert
+POSTHOOK: query: DESC FORMATTED teradata_binary_table_64kb_insert
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@teradata_binary_table_64kb_insert
+# col_name data_type comment
+test_tinyint tinyint from deserializer
+test_decimal decimal(15,2) from deserializer
+test_date date from deserializer
+test_timestamp timestamp from deserializer
+
+# Detailed Table Information
+Database: default
+#### A masked pattern was here ####
+Retention: 0
+#### A masked pattern was here ####
+Table Type: MANAGED_TABLE
+Table Parameters:
+ COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
+ bucketing_version 2
+ numFiles 1
+ numRows 50
+ rawDataSize 0
+ teradata.char.charset LATIN
+ teradata.row.length 64KB
+ teradata.timestamp.precision 0
+ totalSize 1800
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library: org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde
+InputFormat: org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat
+OutputFormat: org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat
+Compressed: No
+Num Buckets: -1
+Bucket Columns: []
+Sort Columns: []
+Storage Desc Params:
+ serialization.format 1
+PREHOOK: query: DESC FORMATTED teradata_binary_table_1mb_insert
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@teradata_binary_table_1mb_insert
+POSTHOOK: query: DESC FORMATTED teradata_binary_table_1mb_insert
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@teradata_binary_table_1mb_insert
+# col_name data_type comment
+test_tinyint tinyint from deserializer
+test_int int from deserializer
+
+# Detailed Table Information
+Database: default
+#### A masked pattern was here ####
+Retention: 0
+#### A masked pattern was here ####
+Table Type: MANAGED_TABLE
+Table Parameters:
+ COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"test_int\":\"true\",\"test_tinyint\":\"true\"}}
+ bucketing_version 2
+ numFiles 1
+ numRows 1
+ rawDataSize 0
+ teradata.char.charset UNICODE
+ teradata.row.length 1MB
+ teradata.timestamp.precision 6
+ totalSize 11
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library: org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde
+InputFormat: org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat
+OutputFormat: org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat
+Compressed: No
+Num Buckets: -1
+Bucket Columns: []
+Sort Columns: []
+Storage Desc Params:
+ serialization.format 1
+PREHOOK: query: DROP TABLE if exists teradata_binary_table_64kb
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@teradata_binary_table_64kb
+PREHOOK: Output: default@teradata_binary_table_64kb
+POSTHOOK: query: DROP TABLE if exists teradata_binary_table_64kb
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@teradata_binary_table_64kb
+POSTHOOK: Output: default@teradata_binary_table_64kb
+PREHOOK: query: DROP TABLE if exists teradata_binary_table_1mb
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@teradata_binary_table_1mb
+PREHOOK: Output: default@teradata_binary_table_1mb
+POSTHOOK: query: DROP TABLE if exists teradata_binary_table_1mb
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@teradata_binary_table_1mb
+POSTHOOK: Output: default@teradata_binary_table_1mb
+PREHOOK: query: DROP TABLE if exists teradata_binary_table_64kb_insert
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@teradata_binary_table_64kb_insert
+PREHOOK: Output: default@teradata_binary_table_64kb_insert
+POSTHOOK: query: DROP TABLE if exists teradata_binary_table_64kb_insert
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@teradata_binary_table_64kb_insert
+POSTHOOK: Output: default@teradata_binary_table_64kb_insert
+PREHOOK: query: DROP TABLE if exists teradata_binary_table_1mb_insert
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@teradata_binary_table_1mb_insert
+PREHOOK: Output: default@teradata_binary_table_1mb_insert
+POSTHOOK: query: DROP TABLE if exists teradata_binary_table_1mb_insert
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@teradata_binary_table_1mb_insert
+POSTHOOK: Output: default@teradata_binary_table_1mb_insert
http://git-wip-us.apache.org/repos/asf/hive/blob/b8d82844/serde/src/java/org/apache/hadoop/hive/serde2/teradata/TeradataBinaryDataInputStream.java
----------------------------------------------------------------------
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/teradata/TeradataBinaryDataInputStream.java b/serde/src/java/org/apache/hadoop/hive/serde2/teradata/TeradataBinaryDataInputStream.java
new file mode 100644
index 0000000..b26d342
--- /dev/null
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/teradata/TeradataBinaryDataInputStream.java
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.serde2.teradata;
+
+import org.apache.commons.io.input.SwappedDataInputStream;
+import org.apache.commons.lang.ArrayUtils;
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.hive.common.type.Date;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.common.type.Timestamp;
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.math.BigInteger;
+import java.text.ParseException;
+
+import static java.lang.String.format;
+
+/**
+ * The TeradataBinaryDataInputStream is used to handle the Teradata binary format input for records.
+ * Since the TD binary format uses little-endian for SHORT, INT, LONG, DOUBLE, etc.
+ * while Hadoop uses big-endian,
+ * we extend SwappedDataInputStream to handle these types, and extend it further to handle Teradata-specific
+ * types like VARCHAR, CHAR, TIMESTAMP, DATE...
+ */
+public class TeradataBinaryDataInputStream extends SwappedDataInputStream {
+
+ private static final int DATE_STRING_LENGTH = 8;
+
+ /**
+ * Instantiates a new Teradata binary data input stream.
+ *
+ * @param input the input
+ */
+ public TeradataBinaryDataInputStream(InputStream input) {
+ super(input);
+ }
+
+ /**
+ * Read VARCHAR(N).
+ * The representation of Varchar in Teradata binary format is:
+ * the first two bytes represent the length N of this varchar field,
+ * the next N bytes represent the content of this varchar field.
+ * A null varchar is represented by length 0 and no content.
+ *
+ * @return the string
+ * @throws IOException the io exception
+ */
+ public String readVarchar() throws IOException {
+ int varcharLength = readUnsignedShort();
+ byte[] varcharContent = new byte[varcharLength];
+ int numOfBytesRead = in.read(varcharContent);
+ if (varcharContent.length != 0 && numOfBytesRead != varcharLength) {
+ throw new EOFException(
+ format("Fail to read the varchar. Expect %d bytes, get %d bytes", varcharLength, numOfBytesRead));
+ }
+ // force it to be a UTF8 string
+ return new String(varcharContent, "UTF8");
+ }
+
+ /**
+ * Read TIMESTAMP(P).
+ * The representation of timestamp in Teradata binary format is:
+ * the number of bytes to read is based on the precision of the timestamp;
+ * each byte represents one char and the timestamp uses a string representation,
+ * eg: for TIMESTAMP(6), we need to read 26 bytes
+ * 31 39 31 31 2d 31 31 2d 31 31 20 31 39 3a 32 30 3a 32 31 2e 34 33 33 32 30 30
+ * will represent 1911-11-11 19:20:21.433200.
+ * A null timestamp is padded with spaces.
+ *
+ * @param byteNum the byte number that will be read from inputstream
+ * @return the timestamp
+ * @throws IOException the io exception
+ */
+ public Timestamp readTimestamp(Integer byteNum) throws IOException {
+ // yyyy-mm-dd hh:mm:ss
+ byte[] timestampContent = new byte[byteNum];
+ int numOfBytesRead = in.read(timestampContent);
+ if (timestampContent.length != 0 && numOfBytesRead != byteNum) {
+ throw new EOFException(
+ format("Fail to read the timestamp. Expect %d bytes, get %d bytes", byteNum, numOfBytesRead));
+ }
+ String timestampStr = new String(timestampContent, "UTF8");
+ if (timestampStr.trim().length() == 0) {
+ return null;
+ }
+ return Timestamp.valueOf(timestampStr);
+ }
+
+ /**
+ * Read DATE.
+ * The representation of date in Teradata binary format is:
+ * The date D is an int of 4 bytes using little endian,
+ * The representation is (D + 19000000).toString -> YYYYMMDD,
+ * e.g.: date 07 b2 01 00 -> 111111 in little endian -> 19111111 -> 1911.11.11.
+ * A null date is padded with 0.
+ *
+ * @return the date
+ * @throws IOException the io exception
+ * @throws ParseException the parse exception
+ */
+ public Date readDate() throws IOException, ParseException {
+ int di = readInt();
+ if (di == 0) {
+ return null;
+ }
+ String dateString = String.valueOf(di + 19000000);
+ if (dateString.length() < DATE_STRING_LENGTH) {
+ dateString = StringUtils.leftPad(dateString, DATE_STRING_LENGTH, '0');
+ }
+ Date date = new Date();
+ date.setYear(Integer.parseInt(dateString.substring(0, 4)));
+ date.setMonth(Integer.parseInt(dateString.substring(4, 6)));
+ date.setDayOfMonth(Integer.parseInt(dateString.substring(6, 8)));
+ return date;
+ }
+
+ /**
+ * Read CHAR(N).
+ * The representation of char in Teradata binary format is:
+ * the number of bytes to read is [charLength] * [bytePerChar] (totalLength);
+ * bytePerChar is decided by the charset: the LATIN charset is 2 bytes per char and the UNICODE charset is 3 bytes per char.
+ * A null char is padded with spaces.
+ *
+ * @param totalLength the total length
+ * @return the string
+ * @throws IOException the io exception
+ */
+ public String readChar(int totalLength) throws IOException {
+ byte[] charContent = new byte[totalLength];
+ int numOfBytesRead = in.read(charContent);
+ if (charContent.length != 0 && numOfBytesRead != totalLength) {
+ throw new EOFException(
+ format("Fail to read the varchar. Expect %d bytes, get %d bytes", totalLength, numOfBytesRead));
+ }
+ return new String(charContent, "UTF8");
+ }
+
+ /**
+ * Read DECIMAL(P, S).
+ * The representation of decimal in Teradata binary format is
+ * the number of bytes to read is decided solely by the precision (P);
+ * HiveDecimal is constructed through the byte array and scale.
+ * A null DECIMAL is padded with 0x00.
+ *
+ * @param scale the scale
+ * @param byteNum the byte num
+ * @return the hive decimal
+ * @throws IOException the io exception
+ */
+ public HiveDecimal readDecimal(int scale, int byteNum) throws IOException {
+ byte[] decimalContent = new byte[byteNum];
+ int numOfBytesRead = in.read(decimalContent);
+ if (decimalContent.length != 0 && numOfBytesRead != byteNum) {
+ throw new EOFException(
+ format("Fail to read the decimal. Expect %d bytes, get %d bytes", byteNum, numOfBytesRead));
+ }
+ ArrayUtils.reverse(decimalContent);
+ return HiveDecimal.create(new BigInteger(decimalContent), scale);
+ }
+
+ /**
+ * Read VARBYTE(N).
+ * The representation of VARBYTE in Teradata binary format is:
+ * the first two bytes represent the length N of this varbyte field,
+ * the next N bytes represent the content of this varbyte field.
+ * A null varbyte is represented by length 0 and no content.
+ *
+ * @return the byte [ ]
+ * @throws IOException the io exception
+ */
+ public byte[] readVarbyte() throws IOException {
+ int varbyteLength = readUnsignedShort();
+ byte[] varbyteContent = new byte[varbyteLength];
+ int numOfBytesRead = in.read(varbyteContent);
+ if (varbyteContent.length != 0 && numOfBytesRead != varbyteLength) {
+ throw new EOFException(
+ format("Fail to read the varbyte. Expect %d bytes, get %d bytes", varbyteLength, numOfBytesRead));
+ }
+ return varbyteContent;
+ }
+}
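
The DATE rule in readDate() above boils down to this small sketch (a hypothetical
helper, not part of this patch): the stored int is (YYYYMMDD - 19000000), so adding
19000000 back and left-padding to 8 digits recovers the calendar fields:

    import org.apache.commons.lang.StringUtils;

    public class DateDecodeSketch {
      /** Decodes a Teradata DATE int (already byte-swapped) into "YYYY-MM-DD"; 0 encodes NULL. */
      static String decode(int d) {
        if (d == 0) {
          return null;                                   // a null date is padded with 0
        }
        String s = StringUtils.leftPad(String.valueOf(d + 19000000), 8, '0');
        return s.substring(0, 4) + "-" + s.substring(4, 6) + "-" + s.substring(6, 8);
      }
      // decode(111111) yields "1911-11-11", matching the 07 b2 01 00 example above.
    }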
http://git-wip-us.apache.org/repos/asf/hive/blob/b8d82844/serde/src/java/org/apache/hadoop/hive/serde2/teradata/TeradataBinaryDataOutputStream.java
----------------------------------------------------------------------
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/teradata/TeradataBinaryDataOutputStream.java b/serde/src/java/org/apache/hadoop/hive/serde2/teradata/TeradataBinaryDataOutputStream.java
new file mode 100644
index 0000000..f2f801d
--- /dev/null
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/teradata/TeradataBinaryDataOutputStream.java
@@ -0,0 +1,270 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.serde2.teradata;
+
+import org.apache.commons.io.EndianUtils;
+import org.apache.commons.io.output.ByteArrayOutputStream;
+import org.apache.commons.lang.ArrayUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.serde2.io.DateWritableV2;
+import org.apache.hadoop.hive.serde2.io.HiveCharWritable;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable;
+import org.apache.hadoop.hive.serde2.io.TimestampWritableV2;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.Text;
+
+import java.io.IOException;
+import java.math.BigInteger;
+import java.util.Arrays;
+import java.util.Collections;
+
+import static java.lang.String.join;
+import static java.lang.String.format;
+
+
+/**
+ * The TeradataBinaryDataOutputStream is used to produce output compliant with the Teradata binary format,
+ * so the output can be loaded directly into a Teradata DB using TPT fastload.
+ * Since the TD binary format uses little-endian for SHORT, INT, LONG, DOUBLE, etc.
+ * while Hadoop uses big-endian,
+ * we extend ByteArrayOutputStream to write little-endian bytes for these types, and extend it further to handle
+ * Teradata-specific types like VARCHAR, CHAR, TIMESTAMP, DATE...
+ */
+public class TeradataBinaryDataOutputStream extends ByteArrayOutputStream {
+
+ private static final Log LOG = LogFactory.getLog(TeradataBinaryDataOutputStream.class);
+
+ private static final int TIMESTAMP_NO_NANOS_BYTE_NUM = 19;
+
+ public TeradataBinaryDataOutputStream() {
+ }
+
+ /**
+ * Write VARCHAR(N).
+ * The representation of Varchar in Teradata binary format is:
+ * the first two bytes represent the length N of this varchar field,
+ * the next N bytes represent the content of this varchar field.
+ * A null varchar is represented by length 0 and no content.
+ *
+ * @param writable the writable
+ * @throws IOException the io exception
+ */
+ public void writeVarChar(HiveVarcharWritable writable) throws IOException {
+ if (writable == null) {
+ EndianUtils.writeSwappedShort(this, (short) 0);
+ return;
+ }
+ Text t = writable.getTextValue();
+ int varcharLength = t.getLength();
+ EndianUtils.writeSwappedShort(this, (short) varcharLength); // write the varchar length
+ write(t.getBytes(), 0, varcharLength); // write the varchar content
+ }
+
+ /**
+ * Write INT.
+ * Using little-endian to write the int.
+ *
+ * @param i the int to write
+ * @throws IOException the io exception
+ */
+ public void writeInt(int i) throws IOException {
+ EndianUtils.writeSwappedInteger(this, i);
+ }
+
+ /**
+ * Write TIMESTAMP(N).
+ * The representation of timestamp in Teradata binary format is:
+ * the number of bytes to write is based on the precision of the timestamp;
+ * each byte represents one char and the timestamp uses a string representation,
+ * eg: for 1911-11-11 19:20:21.433200 in TIMESTAMP(3), we will cut it to be 1911-11-11 19:20:21.433 and write
+ * 31 39 31 31 2d 31 31 2d 31 31 20 31 39 3a 32 30 3a 32 31 2e 34 33 33.
+ * A null timestamp is padded with spaces.
+ *
+ * @param timestamp the timestamp
+ * @param byteNum the byte number the timestamp will write
+ * @throws IOException the io exception
+ */
+ public void writeTimestamp(TimestampWritableV2 timestamp, int byteNum) throws IOException {
+ if (timestamp == null) {
+ String pad = join("", Collections.nCopies(byteNum, " "));
+ write(pad.getBytes("UTF8"));
+ return;
+ }
+ String sTimeStamp = timestamp.getTimestamp().toString();
+ if (sTimeStamp.length() >= byteNum) {
+ write(sTimeStamp.substring(0, byteNum).getBytes("UTF8"));
+ return;
+ }
+ write(sTimeStamp.getBytes("UTF8"));
+ String pad;
+ if (sTimeStamp.length() == TIMESTAMP_NO_NANOS_BYTE_NUM) {
+ pad = "." + join("", Collections.nCopies(byteNum - sTimeStamp.length() - 1, "0"));
+ } else {
+ pad = join("", Collections.nCopies(byteNum - sTimeStamp.length(), "0"));
+ }
+ write(pad.getBytes("UTF8"));
+ }
+
+ /**
+ * Write DOUBLE.
+ * Using little-endian to write the double.
+ *
+ * @param d the double to write
+ * @throws IOException the io exception
+ */
+ public void writeDouble(double d) throws IOException {
+ EndianUtils.writeSwappedDouble(this, d);
+ }
+
+ /**
+ * Write DATE.
+ * The representation of date in Teradata binary format is:
+ * The date D is an int of 4 bytes using little endian.
+ * The representation is (YYYYMMDD - 19000000).toInt -> D,
+ * e.g. 1911.11.11 -> 19111111 -> 111111 -> 07 b2 01 00 in little endian.
+ * A null date is padded with 0.
+ *
+ * @param date the date
+ * @throws IOException the io exception
+ */
+ public void writeDate(DateWritableV2 date) throws IOException {
+ if (date == null) {
+ EndianUtils.writeSwappedInteger(this, 0);
+ return;
+ }
+ int toWrite = date.get().getYear() * 10000 + date.get().getMonth() * 100 + date.get().getDay() - 19000000;
+ EndianUtils.writeSwappedInteger(this, toWrite);
+ }
+
+ /**
+ * Write LONG.
+ * Using little-endian to write the long.
+ *
+ * @param l the long to write
+ * @throws IOException the io exception
+ */
+ public void writeLong(long l) throws IOException {
+ EndianUtils.writeSwappedLong(this, l);
+ }
+
+ /**
+ * Write CHAR(N).
+ * The representation of char in Teradata binary format is:
+ * the number of bytes to write is [charLength] * [bytePerChar] (totalLength);
+ * bytePerChar is decided by the charset: the LATIN charset is 2 bytes per char and the UNICODE charset is 3 bytes per char.
+ * A null char is padded with spaces.
+ *
+ * @param writable the writable
+ * @param length the total byte length N of the char field
+ * @throws IOException the io exception
+ */
+ public void writeChar(HiveCharWritable writable, int length) throws IOException {
+ if (writable == null) {
+ String pad = join("", Collections.nCopies(length, " "));
+ write(pad.getBytes("UTF8"));
+ return;
+ }
+ Text t = writable.getStrippedValue();
+ int contentLength = t.getLength();
+ write(t.getBytes(), 0, contentLength);
+ if (length - contentLength < 0) {
+ throw new IOException(format("The byte num %s of HiveCharWritable is more than the byte num %s we can hold. "
+ + "The content of HiveCharWritable is %s", contentLength, length, writable.getPaddedValue()));
+ }
+ if (length > contentLength) {
+ String pad = join("", Collections.nCopies(length - contentLength, " "));
+ write(pad.getBytes("UTF8"));
+ }
+ }
+
+ /**
+ * Write DECIMAL(P, S).
+ * The representation of decimal in Teradata binary format is:
+ * the number of bytes to write is decided solely by the precision (P);
+ * HiveDecimal is constructed through the byte array and scale.
+ * the remaining bytes are padded with 0x00 (positive) or 0xFF (negative).
+ * A null DECIMAL is padded with 0x00.
+ *
+ * @param writable the writable
+ * @param byteNum the byte num
+ * @param scale the scale
+ * @throws IOException the io exception
+ */
+ public void writeDecimal(HiveDecimalWritable writable, int byteNum, int scale) throws IOException {
+ if (writable == null) {
+ byte[] pad = new byte[byteNum];
+ write(pad);
+ return;
+ }
+ // since the HiveDecimal will auto adjust the scale to save resource
+ // we need to adjust it back otherwise the output bytes will be wrong
+ int hiveScale = writable.getHiveDecimal().scale();
+ BigInteger bigInteger = writable.getHiveDecimal().unscaledValue();
+ if (hiveScale < scale) {
+ BigInteger multiplicand = new BigInteger("1" + join("", Collections.nCopies(scale - hiveScale, "0")));
+ bigInteger = bigInteger.multiply(multiplicand);
+ }
+ byte[] content = bigInteger.toByteArray();
+ int signBit = content[0] >> 7 & 1;
+ ArrayUtils.reverse(content);
+ write(content);
+ if (byteNum > content.length) {
+ byte[] pad;
+ if (signBit == 0) {
+ pad = new byte[byteNum - content.length]; // pad non-negative values with 0x00
+ } else {
+ pad = new byte[byteNum - content.length];
+ Arrays.fill(pad, (byte) 255); // sign-extend negative values with 0xFF
+ }
+ write(pad);
+ }
+ }
+
+ /**
+ * Write SHORT.
+ * Using little-endian to write the short.
+ *
+ * @param s the short to write
+ * @throws IOException the io exception
+ */
+ public void writeShort(short s) throws IOException {
+ EndianUtils.writeSwappedShort(this, s);
+ }
+
+ /**
+ * Write VARBYTE(N).
+ * The representation of VARBYTE in Teradata binary format is:
+ * the first two bytes represent the length N of this varbyte field,
+ * the next N bytes represent the content of this varbyte field.
+ * A null varbyte is represented by length 0 and no content.
+ *
+ * @param writable the writable
+ * @throws IOException the io exception
+ */
+ public void writeVarByte(BytesWritable writable) throws IOException {
+ if (writable == null) {
+ EndianUtils.writeSwappedShort(this, (short) 0);
+ return;
+ }
+ int varbyteLength = writable.getLength();
+ EndianUtils.writeSwappedShort(this, (short) varbyteLength); // write the varbyte length
+ write(writable.getBytes(), 0, varbyteLength); // write the varbyte content
+ }
+}
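
The trickiest step in writeDecimal() above is rescaling the unscaled value and then
sign-extending the little-endian bytes. This sketch shows the same idea end to end;
it is not part of this patch, and BigDecimal.setScale stands in for the HiveDecimal
rescaling done by the serde:

    import java.io.ByteArrayOutputStream;
    import java.io.IOException;
    import java.math.BigDecimal;
    import java.math.BigInteger;
    import java.util.Arrays;
    import org.apache.commons.lang.ArrayUtils;

    public class DecimalEncodeSketch {
      static byte[] encode(BigDecimal value, int scale, int byteNum) throws IOException {
        BigInteger unscaled = value.setScale(scale).unscaledValue(); // e.g. 3.14 -> 314 for scale 2
        byte[] content = unscaled.toByteArray();                     // big-endian two's complement
        int signBit = content[0] >> 7 & 1;                           // sign of the most significant byte
        ArrayUtils.reverse(content);                                 // convert to little-endian
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        out.write(content);
        byte[] pad = new byte[byteNum - content.length];             // zero-filled for non-negatives
        if (signBit == 1) {
          Arrays.fill(pad, (byte) 0xFF);                             // sign-extend negative values
        }
        out.write(pad);
        return out.toByteArray();
      }
      // encode(new BigDecimal("3.14"), 2, 8) -> 3a 01 00 00 00 00 00 00 (314, little-endian).
    }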