Posted to dev@orc.apache.org by GitBox <gi...@apache.org> on 2021/04/09 14:43:36 UTC

[GitHub] [orc] pgaref commented on a change in pull request #668: ORC-742: Added Lazy IO for non filter columns

pgaref commented on a change in pull request #668:
URL: https://github.com/apache/orc/pull/668#discussion_r610654927



##########
File path: java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
##########
@@ -958,19 +981,21 @@ public SargApplier(SearchArgument sarg,
       filterColumns = mapSargColumnsToOrcInternalColIdx(sargLeaves,
                                                         evolution);
       this.rowIndexStride = rowIndexStride;
+      this.evolution = evolution;
+      exceptionCount = new long[sargLeaves.size()];
+      this.useUTCTimestamp = useUTCTimestamp;
+    }
+
+    public void setRowIndexCols(boolean[] rowIndexCols) {

Review comment:
       Is this change related? Shall we move it to a separate ticket if that's the case?

##########
File path: java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
##########
@@ -230,15 +247,22 @@ protected RecordReaderImpl(ReaderImpl fileReader,
     SortedSet<Integer> filterColIds = new TreeSet<>();
     if (options.getPreFilterColumnNames() != null) {
       for (String colName : options.getPreFilterColumnNames()) {
-        int expandColId = findColumns(evolution, colName);
+        TypeDescription expandCol = findColumnType(evolution, colName);
         // If the column is not present in the file then this can be ignored from read.
-        if (expandColId != -1) {
-          filterColIds.add(expandColId);
+        while(expandCol != null && expandCol.getId() != -1) {

Review comment:
       Missing space: `while(` should be `while (`.

##########
File path: java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
##########
@@ -1302,25 +1427,33 @@ private int findStripe(long rowNumber) {
     throw new IllegalArgumentException("Seek after the end of reader range");
   }
 
-  public OrcIndex readRowIndex(int stripeIndex, boolean[] included,
-                               boolean[] sargColumns) throws IOException {
-    // if this is the current stripe, use the cached objects.
-    if (stripeIndex == currentStripe) {
-      return planner.readRowIndex(
-          sargColumns != null || sargApp == null
-              ? sargColumns : sargApp.sargColumns, indexes);
+  private void readCurrentStripeRowIndex() throws IOException {
+    planner.readRowIndex(rowIndexColsToRead, indexes);
+  }
+
+  public OrcIndex readRowIndex(int stripeIndex,
+                               boolean[] included,
+                               boolean[] readCols) throws IOException {
+    // Use the cached objects if the read request matches the cached request
+    if (stripeIndex == currentStripe
+            && (readCols == null || Arrays.equals(readCols, rowIndexColsToRead))) {
+      if (rowIndexColsToRead != null) {
+        return indexes;
+      } else {
+        return planner.readRowIndex(readCols, indexes);
+      }
     } else {
       StripePlanner copy = new StripePlanner(planner);
       if (included == null) {
         included = new boolean[schema.getMaximumId() + 1];
         Arrays.fill(included, true);
       }
       copy.parseStripe(stripes.get(stripeIndex), included);
-      return copy.readRowIndex(sargColumns, null);
+      return copy.readRowIndex(readCols, null);
     }
   }
 
-  private void seekToRowEntry(BatchReader reader, int rowEntry)
+  private void seekToRowEntry(BatchReader reader, int rowEntry, EnumSet<TypeReader.ReadLevel> readLevel)

Review comment:
       We already have readLevel as part of our RecordReaderImpl instance; is there any reason we also pass it as a parameter here?
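
       For illustration, a minimal sketch of the alternative (hypothetical; it assumes the readLevel field on RecordReaderImpl that this comment refers to):

           // Hypothetical overload: reuse the readLevel already held by the
           // RecordReaderImpl instance instead of threading it as a parameter.
           private void seekToRowEntry(BatchReader reader, int rowEntry) throws IOException {
             seekToRowEntry(reader, rowEntry, this.readLevel);
           }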

##########
File path: java/mapreduce/src/java/org/apache/orc/mapred/OrcMapredRecordReader.java
##########
@@ -98,16 +101,22 @@ public boolean next(NullWritable key, V value) throws IOException {
     if (!ensureBatch()) {
       return false;
     }
+    int rowIdx = batch.selectedInUse ? batch.selected[rowInBatch] : rowInBatch;
     if (schema.getCategory() == TypeDescription.Category.STRUCT) {
       OrcStruct result = (OrcStruct) value;
       List<TypeDescription> children = schema.getChildren();
       int numberOfChildren = children.size();
       for(int i=0; i < numberOfChildren; ++i) {
-        result.setFieldValue(i, nextValue(batch.cols[i], rowInBatch,
-            children.get(i), result.getFieldValue(i)));
+        TypeDescription child = children.get(i);

Review comment:
       unrelated change?
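
       For context, a minimal sketch of why this mapping is needed once a row filter has run (assuming standard VectorizedRowBatch semantics):

           // When a filter has been applied, selectedInUse is true and `selected`
           // holds the physical indexes of the surviving rows in order; otherwise
           // rows are dense and the logical position is the physical one.
           int rowIdx = batch.selectedInUse ? batch.selected[rowInBatch] : rowInBatch;
           // Column values must then be read at rowIdx rather than rowInBatch.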

##########
File path: java/core/src/java/org/apache/orc/impl/reader/tree/TypeReader.java
##########
@@ -24,22 +25,51 @@
 import org.apache.orc.impl.reader.StripePlanner;
 
 import java.io.IOException;
+import java.util.EnumSet;
 
 public interface TypeReader {
   void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException;
 
-  void startStripe(StripePlanner planner) throws IOException;
+  void startStripe(StripePlanner planner, EnumSet<ReadLevel> readLevel) throws IOException;
 
-  void seek(PositionProvider[] index) throws IOException;
+  void seek(PositionProvider[] index, EnumSet<ReadLevel> readLevel) throws IOException;
 
-  void seek(PositionProvider index) throws IOException;
+  void seek(PositionProvider index, EnumSet<ReadLevel> readLevel) throws IOException;
 
-  void skipRows(long rows) throws IOException;
+  void skipRows(long rows, EnumSet<ReadLevel> readLevel) throws IOException;
 
   void nextVector(ColumnVector previous,
                   boolean[] isNull,
                   int batchSize,
-                  FilterContext filterContext) throws IOException;
+                  FilterContext filterContext,
+                  EnumSet<ReadLevel> readLevel) throws IOException;
 
   int getColumnId();
+
+  ReadLevel getReadLevel();
+
+  /**
+   * Determines whether a child reader should be processed at the given read level. A child is
+   * allowed if its own read level matches, or if it is a LEAD_PARENT; the latter allows the
+   * FOLLOW children nested under a LEAD_PARENT to be handled.
+   * @param reader the child reader that is being evaluated
+   * @param readLevel the requested read level
+   * @return true if allowed by the read level or if it is a LEAD_PARENT, otherwise false
+   */
+  static boolean allowChild(TypeReader reader, EnumSet<ReadLevel> readLevel) {
+    return readLevel.contains(reader.getReadLevel())
+           || reader.getReadLevel() == ReadLevel.LEAD_PARENT;
+  }
+
+  enum ReadLevel {
+    LEAD_CHILD,    // Read only the elementary filter columns

Review comment:
       Do we really need to distinguish between LEAD_CHILD and LEAD_PARENT? 
   Why not just LEAD and FOLLOW?
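
       For comparison, a hypothetical two-level variant (a sketch only; note it would drop the LEAD_PARENT special case that allowChild relies on above):

           // Hypothetical: collapse LEAD_CHILD/LEAD_PARENT into a single LEAD level.
           enum ReadLevel {
             LEAD,    // filter columns, read eagerly
             FOLLOW,  // non-filter columns, read lazily
             ALL      // no distinction; read everything
           }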

##########
File path: java/mapreduce/src/java/org/apache/orc/mapreduce/OrcMapreduceRecordReader.java
##########
@@ -96,16 +99,22 @@ public boolean nextKeyValue() throws IOException, InterruptedException {
     if (!ensureBatch()) {
       return false;
     }
+    int rowIdx = batch.selectedInUse ? batch.selected[rowInBatch] : rowInBatch;
     if (schema.getCategory() == TypeDescription.Category.STRUCT) {
       OrcStruct result = (OrcStruct) row;
       List<TypeDescription> children = schema.getChildren();
       int numberOfChildren = children.size();
       for(int i=0; i < numberOfChildren; ++i) {
-        result.setFieldValue(i, OrcMapredRecordReader.nextValue(batch.cols[i], rowInBatch,
-            children.get(i), result.getFieldValue(i)));
+        TypeDescription child = children.get(i);

Review comment:
       unrelated change?

##########
File path: java/core/src/test/org/apache/orc/impl/TestSchemaEvolution.java
##########
@@ -1732,10 +1732,10 @@ public void testTypeConversion() throws IOException {
         dataReader, OrcFile.WriterVersion.ORC_14, true, Integer.MAX_VALUE);
     boolean[] columns = new boolean[]{true, true, true};
     planner.parseStripe(dataReader.getStripe(0), columns)
-        .readData(null, null, false);
-    reader.startStripe(planner);
+        .readData(null, null, false, TypeReader.ReadLevel.ALL);
+    reader.startStripe(planner, TypeReader.ReadLevel.ALL);
     VectorizedRowBatch batch = readType.createRowBatch();
-    reader.nextBatch(batch, 10);
+    reader.nextBatch(batch, 10, TypeReader.ReadLevel.ALL);

Review comment:
       Use an overloaded method for the ALL case and create a separate test for partial read levels?
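
       A minimal sketch of what such overloads might look like (hypothetical; they default to ReadLevel.ALL so existing call sites stay unchanged):

           // Hypothetical convenience overloads that default to reading everything.
           public void startStripe(StripePlanner planner) throws IOException {
             startStripe(planner, TypeReader.ReadLevel.ALL);
           }
           public void nextBatch(VectorizedRowBatch batch, int batchSize) throws IOException {
             nextBatch(batch, batchSize, TypeReader.ReadLevel.ALL);
           }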

##########
File path: java/core/src/java/org/apache/orc/impl/reader/StripePlanner.java
##########
@@ -144,10 +165,23 @@ public StripePlanner parseStripe(StripeInformation stripe,
    * @return the buffers that were read
    */
   public BufferChunkList readData(OrcIndex index,
-                                boolean[] rowGroupInclude,
-                                boolean forceDirect) throws IOException {
+                                  boolean[] rowGroupInclude,

Review comment:
       Update the doc here? How does readLevel change the behaviour of the method?
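
       For example, the javadoc could be extended along these lines (suggested wording only):

           /**
            * ...
            * @param readLevel which subset of the selected columns to plan reads for,
            *                  e.g. only the filter columns, only the non-filter
            *                  (lazy) columns, or everything
            * @return the buffers that were read
            */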




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org