You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by dv...@apache.org on 2014/09/04 00:37:18 UTC

git commit: do ProtocolEvents fixing only when there is required fields missing in the requested schema

Repository: incubator-parquet-mr
Updated Branches:
  refs/heads/master 7a105068e -> f8b06df7a


do ProtocolEvents fixing only when there is required fields missing in the requested schema

https://issues.apache.org/jira/browse/PARQUET-61
This PR is trying to redo the https://github.com/apache/incubator-parquet-mr/pull/7

In this PR, it fixes the protocol event in a more precise condition:
Only when the requested schema missing some required fields that are present in the full schema

So even if there a projection, as long as the projection is not getting rid of the required field, the protocol events amender will not be called.

Could you take a look at this ? @dvryaboy @yan-qi

Author: Tianshuo Deng <td...@twitter.com>

Closes #28 from tsdeng/fix_protocol_when_required_field_missing and squashes the following commits:

ba778b9 [Tianshuo Deng] add continue for readability
d5639df [Tianshuo Deng] fix unused import
090e894 [Tianshuo Deng] format
13a609d [Tianshuo Deng] comment format
ef1fe58 [Tianshuo Deng] little refactor, remove the hasMissingRequiredFieldFromProjection method
7c2c158 [Tianshuo Deng] format
83a5655 [Tianshuo Deng] do ProtocolEvents fixing only when there is required fields missing in the requested schema


Project: http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/commit/f8b06df7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/tree/f8b06df7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/diff/f8b06df7

Branch: refs/heads/master
Commit: f8b06df7a56f92f4bc7dd564ad7ec026e3b4f3da
Parents: 7a10506
Author: Tianshuo Deng <td...@twitter.com>
Authored: Wed Sep 3 15:37:00 2014 -0700
Committer: dvryaboy <dv...@gmail.com>
Committed: Wed Sep 3 15:37:00 2014 -0700

----------------------------------------------------------------------
 .../parquet/thrift/ThriftRecordConverter.java   | 37 ++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-parquet-mr/blob/f8b06df7/parquet-thrift/src/main/java/parquet/thrift/ThriftRecordConverter.java
----------------------------------------------------------------------
diff --git a/parquet-thrift/src/main/java/parquet/thrift/ThriftRecordConverter.java b/parquet-thrift/src/main/java/parquet/thrift/ThriftRecordConverter.java
index 4157693..fa5e092 100644
--- a/parquet-thrift/src/main/java/parquet/thrift/ThriftRecordConverter.java
+++ b/parquet-thrift/src/main/java/parquet/thrift/ThriftRecordConverter.java
@@ -778,6 +778,7 @@ public class ThriftRecordConverter<T> extends RecordMaterializer<T> {
   private final ParquetReadProtocol protocol;
   private final GroupConverter structConverter;
   private List<TProtocol> rootEvents = new ArrayList<TProtocol>();
+  private boolean missingRequiredFieldsInProjection = false;
 
   /**
    *
@@ -791,9 +792,36 @@ public class ThriftRecordConverter<T> extends RecordMaterializer<T> {
     this.thriftReader = thriftReader;
     this.protocol = new ParquetReadProtocol();
     this.thriftType = thriftType;
+    MessageType fullSchema = new ThriftSchemaConverter().convert(thriftType);
+    missingRequiredFieldsInProjection = hasMissingRequiredFieldInGroupType(requestedParquetSchema, fullSchema);
     this.structConverter = new StructConverter(rootEvents, requestedParquetSchema, new ThriftField(name, (short)0, Requirement.REQUIRED, thriftType));
   }
 
+  private boolean hasMissingRequiredFieldInGroupType(GroupType requested, GroupType fullSchema) {
+    for (Type field : fullSchema.getFields()) {
+
+      if (requested.containsField(field.getName())) {
+        Type requestedType = requested.getType(field.getName());
+        // if a field is in requested schema and the type of it is a group type, then do recursive check
+        if (!field.isPrimitive()) {
+          if (hasMissingRequiredFieldInGroupType(requestedType.asGroupType(), field.asGroupType())) {
+            return true;
+          } else {
+            continue;// check next field
+          }
+        }
+      } else {
+        if (field.getRepetition() == Type.Repetition.REQUIRED) {
+          return true; // if a field is missing in requested schema and it's required
+        } else {
+          continue; // the missing field is not required, then continue checking next field
+        }
+      }
+    }
+
+    return false;
+  }
+
   /**
    *
    * {@inheritDoc}
@@ -802,8 +830,13 @@ public class ThriftRecordConverter<T> extends RecordMaterializer<T> {
   @Override
   public T getCurrentRecord() {
     try {
-      List<TProtocol> fixedEvents = new ProtocolEventsAmender(rootEvents).amendMissingRequiredFields(thriftType);
-      protocol.addAll(fixedEvents);
+      if (missingRequiredFieldsInProjection) {
+        List<TProtocol> fixedEvents = new ProtocolEventsAmender(rootEvents).amendMissingRequiredFields(thriftType);
+        protocol.addAll(fixedEvents);
+      } else {
+        protocol.addAll(rootEvents);
+      }
+
       rootEvents.clear();
       return thriftReader.readOneRecord(protocol);
     } catch (TException e) {