You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by ju...@apache.org on 2017/06/07 22:22:33 UTC

parquet-mr git commit: Parquet-884: Add support for Decimal datatype to Parquet-Pig record reader

Repository: parquet-mr
Updated Branches:
  refs/heads/master 9491d7a61 -> 9d58b6a83


Parquet-884: Add support for Decimal datatype to Parquet-Pig record reader

Adds conversion support to Pig for the Decimal datatype. The implementation is based on the Scala code in the Spark project that provides a similar function for its SQL library.

Author: EllenKletscher <el...@capitalone.com>

Closes #404 from EllenKletscher/master and squashes the following commits:

7714738 [EllenKletscher] add comment for precision check
50c75c8 [EllenKletscher] remove check for primitiveType null
08d4dbb [EllenKletscher] PARQUET-884: Add missing AL header
57c4d72 [EllenKletscher] PARQUET-884: Add missing AL header
ea61267 [EllenKletscher] PARQUET-884: add support for decimal type to pig reader


Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/9d58b6a8
Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/9d58b6a8
Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/9d58b6a8

Branch: refs/heads/master
Commit: 9d58b6a83aa79dcad01c3bcc2ec0a7db74ba83b1
Parents: 9491d7a
Author: EllenKletscher <el...@capitalone.com>
Authored: Wed Jun 7 15:22:28 2017 -0700
Committer: Julien Le Dem <ju...@apache.org>
Committed: Wed Jun 7 15:22:28 2017 -0700

----------------------------------------------------------------------
 .../apache/parquet/pig/PigSchemaConverter.java  |  8 +-
 .../parquet/pig/convert/DecimalUtils.java       | 65 ++++++++++++++++
 .../parquet/pig/convert/TupleConverter.java     | 27 +++++++
 .../apache/parquet/pig/TestDecimalUtils.java    | 79 ++++++++++++++++++++
 4 files changed, 177 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/9d58b6a8/parquet-pig/src/main/java/org/apache/parquet/pig/PigSchemaConverter.java
----------------------------------------------------------------------
diff --git a/parquet-pig/src/main/java/org/apache/parquet/pig/PigSchemaConverter.java b/parquet-pig/src/main/java/org/apache/parquet/pig/PigSchemaConverter.java
index c9eb0ba..e560e42 100644
--- a/parquet-pig/src/main/java/org/apache/parquet/pig/PigSchemaConverter.java
+++ b/parquet-pig/src/main/java/org/apache/parquet/pig/PigSchemaConverter.java
@@ -244,8 +244,12 @@ public class PigSchemaConverter {
 
       @Override
       public FieldSchema convertFIXED_LEN_BYTE_ARRAY(
-          PrimitiveTypeName primitiveTypeName) throws FrontendException {
-        return new FieldSchema(fieldName, null, DataType.BYTEARRAY);
+        PrimitiveTypeName primitiveTypeName) throws FrontendException {
+        if (originalType == OriginalType.DECIMAL) { // DECIMAL-annotated fixed-length values are exposed to Pig as BIGDECIMAL
+          return new FieldSchema(fieldName, null, DataType.BIGDECIMAL);
+        } else {
+          return new FieldSchema(fieldName, null, DataType.BYTEARRAY); // any other fixed-length value stays a raw BYTEARRAY
+        }
       }
 
       @Override

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/9d58b6a8/parquet-pig/src/main/java/org/apache/parquet/pig/convert/DecimalUtils.java
----------------------------------------------------------------------
diff --git a/parquet-pig/src/main/java/org/apache/parquet/pig/convert/DecimalUtils.java b/parquet-pig/src/main/java/org/apache/parquet/pig/convert/DecimalUtils.java
new file mode 100644
index 0000000..f850332
--- /dev/null
+++ b/parquet-pig/src/main/java/org/apache/parquet/pig/convert/DecimalUtils.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.parquet.pig.convert;
+
+import java.nio.ByteBuffer;
+import java.math.BigInteger;
+import java.math.BigDecimal;
+import static java.lang.Math.pow;
+
+import org.apache.parquet.io.api.Binary;
+
+/**
+ * Conversion from the Parquet DECIMAL binary representation to a Java
+ * {@link BigDecimal} for use in Pig.
+ * Code based on the Apache Spark ParquetRowConverter.scala.
+ */
+public class DecimalUtils {
+
+  /**
+   * Largest precision whose unscaled value always fits in a signed 64-bit long
+   * (10^18 - 1 is below Long.MAX_VALUE).
+   */
+  private static final int MAX_LONG_PRECISION = 18;
+
+  /**
+   * Decodes a big-endian two's-complement unscaled value and applies the scale.
+   *
+   * The arithmetic is exact: the unscaled integer is never routed through a
+   * double, so values with 16 to 18 significant digits keep every digit, and the
+   * returned BigDecimal always carries exactly {@code scale} as its scale.
+   *
+   * @param value     binary holding the unscaled value, most significant byte first
+   * @param precision declared precision of the column; only selects the decoding path
+   * @param scale     number of digits to the right of the decimal point
+   * @return the decimal {@code unscaled * 10^(-scale)}
+   */
+  public static BigDecimal binaryToDecimal(Binary value, int precision, int scale) {
+    if (precision > MAX_LONG_PRECISION) {
+      // Too many digits for a long: let BigInteger parse the two's-complement bytes.
+      return new BigDecimal(new BigInteger(value.getBytes()), scale);
+    }
+
+    ByteBuffer buffer = value.toByteBuffer();
+    // Absolute gets work for heap, direct and read-only buffers alike, whereas
+    // ByteBuffer.array() throws for the latter two.
+    int start = buffer.position();
+    int end = buffer.limit();
+    long unscaled = 0L;
+    for (int i = start; i < end; i++) {
+      unscaled = (unscaled << 8) | (buffer.get(i) & 0xff);
+    }
+
+    int length = end - start;
+    if (length > 0 && length < 8) {
+      // Fewer than 8 bytes were read: propagate the sign bit of the first byte.
+      int shift = 64 - 8 * length;
+      unscaled = (unscaled << shift) >> shift;
+    }
+    // With 8 or more bytes the accumulator already holds the low 64 bits, which is
+    // the complete value whenever it fits in a long (guaranteed for precision <= 18).
+
+    return BigDecimal.valueOf(unscaled, scale);
+  }
+}

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/9d58b6a8/parquet-pig/src/main/java/org/apache/parquet/pig/convert/TupleConverter.java
----------------------------------------------------------------------
diff --git a/parquet-pig/src/main/java/org/apache/parquet/pig/convert/TupleConverter.java b/parquet-pig/src/main/java/org/apache/parquet/pig/convert/TupleConverter.java
index 3887332..1c7ab6c 100644
--- a/parquet-pig/src/main/java/org/apache/parquet/pig/convert/TupleConverter.java
+++ b/parquet-pig/src/main/java/org/apache/parquet/pig/convert/TupleConverter.java
@@ -21,6 +21,7 @@ package org.apache.parquet.pig.convert;
 import static java.lang.Math.max;
 import java.util.ArrayList;
 import java.util.List;
+import java.math.BigDecimal;
 
 import org.apache.pig.backend.executionengine.ExecException;
 import org.apache.pig.data.DataByteArray;
@@ -39,9 +40,11 @@ import org.apache.parquet.io.api.Converter;
 import org.apache.parquet.io.api.GroupConverter;
 import org.apache.parquet.io.api.PrimitiveConverter;
 import org.apache.parquet.pig.TupleConversionException;
+import org.apache.parquet.pig.convert.DecimalUtils;
 import org.apache.parquet.schema.GroupType;
 import org.apache.parquet.schema.OriginalType;
 import org.apache.parquet.schema.PrimitiveType;
+import org.apache.parquet.schema.DecimalMetadata;
 import org.apache.parquet.schema.Type;
 import org.apache.parquet.schema.Type.Repetition;
 
@@ -140,6 +143,8 @@ public class TupleConverter extends GroupConverter {
         return new FieldDoubleConverter(parent);
       case DataType.LONG:
         return new FieldLongConverter(parent);
+      case DataType.BIGDECIMAL:
+        return new FieldBigDecimalConverter(type, parent);
       default:
         throw new TupleConversionException("unsupported pig type: " + pigField);
       }
@@ -530,6 +535,28 @@ public class TupleConverter extends GroupConverter {
   }
 
   /**
+   * handle decimal type
+   *
+   */
+  static final class FieldBigDecimalConverter extends PrimitiveConverter {
+    private final ParentValueContainer parent;
+    private final Type primitiveType;
+
+    /**
+     * @param primitiveType Parquet type of the field; its decimal metadata is read for every value
+     * @param parent        container that receives each converted value
+     */
+    public FieldBigDecimalConverter(Type primitiveType, ParentValueContainer parent) {
+      this.primitiveType = primitiveType;
+      this.parent = parent;
+    }
+
+    /**
+     * Converts the binary value to a BigDecimal using the precision and scale
+     * declared on the Parquet type, then hands it to the parent container.
+     */
+    @Override
+    public final void addBinary(Binary value) {
+      final DecimalMetadata decimalInfo = primitiveType.asPrimitiveType().getDecimalMetadata();
+      final BigDecimal decoded =
+          DecimalUtils.binaryToDecimal(value, decimalInfo.getPrecision(), decimalInfo.getScale());
+      parent.add(decoded);
+    }
+  }
+
+
+  /**
    * Converts groups into bags
    *
    * @author Julien Le Dem

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/9d58b6a8/parquet-pig/src/test/java/org/apache/parquet/pig/TestDecimalUtils.java
----------------------------------------------------------------------
diff --git a/parquet-pig/src/test/java/org/apache/parquet/pig/TestDecimalUtils.java b/parquet-pig/src/test/java/org/apache/parquet/pig/TestDecimalUtils.java
new file mode 100644
index 0000000..3b4afe8
--- /dev/null
+++ b/parquet-pig/src/test/java/org/apache/parquet/pig/TestDecimalUtils.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.parquet.pig;
+
+import static org.junit.Assert.assertEquals;
+import org.junit.Test;
+
+import java.math.BigDecimal;
+import static java.lang.Math.abs;
+import java.nio.ByteBuffer;
+
+import org.apache.parquet.io.api.Binary;
+import org.apache.parquet.pig.convert.DecimalUtils;
+
+public class TestDecimalUtils {
+
+  /**
+   * Encodes the unscaled value of {@code originalValue} as a Parquet binary, decodes
+   * it with the given precision and scale, and checks that the result is numerically
+   * equal to {@code stringValue}.
+   *
+   * The comparison uses compareTo rather than toString so the check does not depend
+   * on the scale of the returned BigDecimal ("0" and "0.0" are the same number).
+   */
+  private void assertConversion(BigDecimal originalValue, int precision, int scale, String stringValue) {
+    Binary encoded = Binary.fromByteArray(originalValue.unscaledValue().toByteArray());
+    BigDecimal convertedValue = DecimalUtils.binaryToDecimal(encoded, precision, scale);
+    assertEquals("expected " + stringValue + " but was " + convertedValue,
+                 0, new BigDecimal(stringValue).compareTo(convertedValue));
+  }
+
+  /** Checks a value given as a double; its decimal digits are taken from Double.toString. */
+  private void testDecimalConversion(double value, int precision, int scale, String stringValue) {
+    assertConversion(new BigDecimal(Double.toString(value)), precision, scale, stringValue);
+  }
+
+  /** Checks a value given as an int. */
+  private void testDecimalConversion(int value, int precision, int scale, String stringValue) {
+    assertConversion(new BigDecimal(Integer.toString(value)), precision, scale, stringValue);
+  }
+
+  /** Checks a value given as a long. */
+  private void testDecimalConversion(long value, int precision, int scale, String stringValue) {
+    assertConversion(new BigDecimal(Long.toString(value)), precision, scale, stringValue);
+  }
+
+  @Test
+  public void testBinaryToDecimal() throws Exception {
+    // Known issue: testing Nx10^M doubles from BigDecimal.unscaledValue() always converts to Nx10 regardless of M
+    // Known issue: any double with precision > 17 breaks in test but not in functional testing
+
+    // Test LONG
+    testDecimalConversion(Long.MAX_VALUE,19,0,"9223372036854775807");
+    testDecimalConversion(Long.MIN_VALUE,19,0,"-9223372036854775808");
+    testDecimalConversion(0L,0,0,"0.0");
+
+    // Test INTEGER
+    testDecimalConversion(Integer.MAX_VALUE,10,0,"2147483647");
+    testDecimalConversion(Integer.MIN_VALUE,10,0,"-2147483648");
+    testDecimalConversion(0,0,0,"0.0");
+
+    // Test DOUBLE
+    testDecimalConversion(12345678912345678d,17,0,"12345678912345678");
+    testDecimalConversion(123456789123456.78,17,2,"123456789123456.78");
+    testDecimalConversion(0.12345678912345678,17,17,"0.12345678912345678");
+    testDecimalConversion(-0.000102,6,6,"-0.000102");
+  }
+}