You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@parquet.apache.org by "ASF GitHub Bot (JIRA)" <ji...@apache.org> on 2018/09/24 12:14:00 UTC

[jira] [Commented] (PARQUET-7) [parquet-thrift] improve performance of thrift push-down code

    [ https://issues.apache.org/jira/browse/PARQUET-7?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16625727#comment-16625727 ] 

ASF GitHub Bot commented on PARQUET-7:
--------------------------------------

gszadovszky closed pull request #7: PARQUET-7: [thrift] avoid thrift amender if all fields are optional.
URL: https://github.com/apache/parquet-mr/pull/7
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/parquet-thrift/src/main/java/parquet/thrift/ThriftRecordConverter.java b/parquet-thrift/src/main/java/parquet/thrift/ThriftRecordConverter.java
index bf0a7df08..071bcb4fa 100644
--- a/parquet-thrift/src/main/java/parquet/thrift/ThriftRecordConverter.java
+++ b/parquet-thrift/src/main/java/parquet/thrift/ThriftRecordConverter.java
@@ -18,9 +18,9 @@
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
-
 import org.apache.thrift.TException;
 import org.apache.thrift.protocol.TField;
 import org.apache.thrift.protocol.TList;
@@ -778,7 +778,12 @@ public void end() {
   private final ParquetReadProtocol protocol;
   private final GroupConverter structConverter;
   private List<TProtocol> rootEvents = new ArrayList<TProtocol>();
+  boolean hasRequiredFields = false;
 
+  //TODO(dmitriy): make this expire things
+  // we probably want to cache this since we might have to keep re-examining the same struct in different
+  // instances of the same converter
+  private static Map<ThriftType.StructType, Boolean> hasRequiredFieldCache = new HashMap<ThriftType.StructType, Boolean>();
   /**
    *
    * @param thriftReader the class responsible for instantiating the final object and read from the protocol
@@ -791,9 +796,54 @@ public ThriftRecordConverter(ThriftReader<T> thriftReader, String name, MessageT
     this.thriftReader = thriftReader;
     this.protocol = new ParquetReadProtocol();
     this.thriftType = thriftType;
+    if (!hasRequiredFieldCache.containsKey(thriftType)) {
+      hasRequiredFieldCache.put(thriftType, somethingIsRequiredInStruct(thriftType));
+    }
+    this.hasRequiredFields = hasRequiredFieldCache.get(thriftType);
     this.structConverter = new StructConverter(rootEvents, requestedParquetSchema, new ThriftField(name, (short)0, Requirement.REQUIRED, thriftType));
   }
 
+  private boolean fieldIsRequired(ThriftField thriftField) {
+    boolean isRequired = (thriftField.getRequirement() == ThriftField.Requirement.REQUIRED);
+    if (isRequired) return true;
+
+    ThriftType elementType = thriftField.getType();
+    ThriftTypeID elementFieldTypeID = elementType.getType();
+    switch (elementFieldTypeID) {
+      case STRUCT:
+        return somethingIsRequiredInStruct((ThriftType.StructType) elementType);
+      case LIST:
+        return somethingIsRequiredInList((ThriftType.ListType) elementType);
+      case MAP:
+        return somethingIsRequiredInMap((ThriftType.MapType) elementType);
+      case SET:
+        return somethingIsRequiredInSet((ThriftType.SetType) elementType);
+      default:
+        return false;
+    }
+  }
+
+  private boolean somethingIsRequiredInList(ThriftType.ListType thriftType) {
+    return fieldIsRequired(thriftType.getValues());
+  }
+  private boolean somethingIsRequiredInSet(ThriftType.SetType thriftType) {
+    return fieldIsRequired(thriftType.getValues());
+  }
+
+  private boolean somethingIsRequiredInMap(ThriftType.MapType thriftType) {
+    return fieldIsRequired(thriftType.getKey()) || fieldIsRequired(thriftType.getValue());
+  }
+
+  private boolean somethingIsRequiredInStruct(ThriftType.StructType thriftType) {
+    boolean isRequired = false;
+    Iterator<ThriftField> childrenIter = thriftType.getChildren().iterator();
+    while (!isRequired && childrenIter.hasNext()) {
+      ThriftField field = childrenIter.next();
+      isRequired = fieldIsRequired(field);
+    }
+    return isRequired;
+  }
+
   /**
    *
    * {@inheritDoc}
@@ -802,10 +852,16 @@ public ThriftRecordConverter(ThriftReader<T> thriftReader, String name, MessageT
   @Override
   public T getCurrentRecord() {
     try {
-      List<TProtocol> fixedEvents = new ProtocolEventsAmender(rootEvents).amendMissingRequiredFields(thriftType);
-      protocol.addAll(fixedEvents);
-      rootEvents.clear();
-      return thriftReader.readOneRecord(protocol);
+      if (hasRequiredFields) {
+        List<TProtocol> fixedEvents = new ProtocolEventsAmender(rootEvents).amendMissingRequiredFields(thriftType);
+        protocol.addAll(fixedEvents);
+        rootEvents.clear();
+        return thriftReader.readOneRecord(protocol);
+     } else {
+       protocol.addAll(rootEvents);
+       rootEvents.clear();
+       return thriftReader.readOneRecord(protocol);
+     }
     } catch (TException e) {
       throw new ParquetDecodingException("Could not read thrift object from protocol", e);
     }
diff --git a/parquet-thrift/src/main/java/parquet/thrift/projection/amend/DefaultProtocolEventsGenerator.java b/parquet-thrift/src/main/java/parquet/thrift/projection/amend/DefaultProtocolEventsGenerator.java
index ce56f861e..ce26f2ca1 100644
--- a/parquet-thrift/src/main/java/parquet/thrift/projection/amend/DefaultProtocolEventsGenerator.java
+++ b/parquet-thrift/src/main/java/parquet/thrift/projection/amend/DefaultProtocolEventsGenerator.java
@@ -49,7 +49,7 @@ public void readStructEnd() throws TException {
     public void readFieldEnd() throws TException {
     }
   };
-  List createdEvents = new ArrayList<TProtocol>();
+  List<TProtocol> createdEvents = new ArrayList<TProtocol>();
 
   public List<TProtocol> createProtocolEventsForField(ThriftField missingField) {
     TProtocol fieldBegin = new ReadFieldBeginProtocol(missingField);


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> [parquet-thrift] improve performance of thrift push-down code
> -------------------------------------------------------------
>
>                 Key: PARQUET-7
>                 URL: https://issues.apache.org/jira/browse/PARQUET-7
>             Project: Parquet
>          Issue Type: Wish
>            Reporter: Dmitriy V. Ryaboy
>            Priority: Major
>              Labels: pull-request-available
>
> A user reported seeing slowness when projection push-down code is active, which seems to stem from ProtocolEventsAmender.
> Details can be found in https://github.com/Parquet/parquet-mr/issues/406



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)