You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pinot.apache.org by "vvivekiyer (via GitHub)" <gi...@apache.org> on 2023/02/28 02:41:20 UTC
[GitHub] [pinot] vvivekiyer commented on a diff in pull request #10286: [multistage] Initial (phase 1) Query runtime for window functions - empty OVER() and OVER(PARTITION BY)

vvivekiyer commented on code in PR #10286:
URL: https://github.com/apache/pinot/pull/10286#discussion_r1119474659


##########
pinot-query-planner/src/main/java/org/apache/pinot/query/planner/stage/WindowNode.java:
##########
@@ -20,6 +20,7 @@
 
 import com.clearspring.analytics.util.Preconditions;
 import java.util.ArrayList;
+import java.util.Collections;

Review Comment:
   Related to planner PR:
   Is there value in repeating the `validateFrameBounds()` checks here? We have a similar check in [PinotWindowExchangeNodeInsertRule.java](https://github.com/apache/pinot/pull/10286/files#diff-347310fedfd85bcf7bb0c4a5aac5f671760f6c0f3f4a65352648499aaa5eb0b0)



##########
pinot-query-runtime/src/main/java/org/apache/pinot/query/runtime/operator/WindowAggregateOperator.java:
##########
@@ -0,0 +1,343 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.query.runtime.operator;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import javax.annotation.Nullable;
+import org.apache.calcite.rel.RelFieldCollation;
+import org.apache.commons.collections.CollectionUtils;
+import org.apache.pinot.common.datablock.DataBlock;
+import org.apache.pinot.common.utils.DataSchema;
+import org.apache.pinot.core.data.table.Key;
+import org.apache.pinot.query.planner.logical.RexExpression;
+import org.apache.pinot.query.routing.VirtualServerAddress;
+import org.apache.pinot.query.runtime.blocks.TransferableBlock;
+import org.apache.pinot.query.runtime.blocks.TransferableBlockUtils;
+import org.apache.pinot.query.runtime.operator.utils.AggregationUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/**
+ * The WindowAggregateOperator is used to compute window function aggregations over a set of optional
+ * PARTITION BY keys, ORDER BY keys and a FRAME clause. The output data will include the projected
+ * columns and in addition will add the aggregation columns to the output data.
+ * [input columns, aggregate result1, ... aggregate resultN]
+ *
+ * The window functions supported today are SUM/COUNT/MIN/MAX aggregations. Window functions also include
+ * other types of functions such as rank and value functions.
+ *
+ * Unlike the AggregateOperator which will output one row per group, the WindowAggregateOperator
+ * will output as many rows as input rows.
+ *
+ * Note: This class performs aggregation over the double value of input.
+ * If the input is single value, the output type will be input type. Otherwise, the output type will be double.
+ *
+ * TODO:
+ *     1. Add support for OVER() clause with ORDER BY only or PARTITION BY ORDER BY
+ *     2. Add support for rank window functions
+ *     3. Add support for value window functions
+ *     4. Add support for custom frames
+ */
+public class WindowAggregateOperator extends MultiStageOperator {
+  private static final String EXPLAIN_NAME = "WINDOW";
+  private static final Logger LOGGER = LoggerFactory.getLogger(WindowAggregateOperator.class);
+
+  private final MultiStageOperator _inputOperator;
+  private final List<RexExpression> _groupSet;
+  private final OrderSetInfo _orderSetInfo;
+  private final WindowFrame _windowFrame;
+  private final List<RexExpression.FunctionCall> _aggCalls;
+  private final List<RexExpression> _constants;
+  private final DataSchema _resultSchema;
+  private final AggregationUtils.Accumulator[] _windowAccumulators;
+  private final Map<Key, List<Object[]>> _partitionRows;
+
+  private TransferableBlock _upstreamErrorBlock;
+
+  private int _numRows;
+  private boolean _readyToConstruct;
+  private boolean _hasReturnedWindowAggregateBlock;
+
+  public WindowAggregateOperator(MultiStageOperator inputOperator, List<RexExpression> groupSet,
+      List<RexExpression> orderSet, List<RelFieldCollation.Direction> orderSetDirection,
+      List<RelFieldCollation.NullDirection> orderSetNullDirection, List<RexExpression> aggCalls, int lowerBound,
+      int upperBound, boolean isRows, List<RexExpression> constants, DataSchema resultSchema, DataSchema inputSchema,
+      long requestId, int stageId, VirtualServerAddress virtualServerAddress) {
+    this(inputOperator, groupSet, orderSet, orderSetDirection, orderSetNullDirection, aggCalls, lowerBound,
+        upperBound, isRows, constants, resultSchema, inputSchema, AggregationUtils.Accumulator.MERGERS,
+        requestId, stageId, virtualServerAddress);
+  }
+
+  @VisibleForTesting
+  public WindowAggregateOperator(MultiStageOperator inputOperator, List<RexExpression> groupSet,
+      List<RexExpression> orderSet, List<RelFieldCollation.Direction> orderSetDirection,
+      List<RelFieldCollation.NullDirection> orderSetNullDirection, List<RexExpression> aggCalls, int lowerBound,
+      int upperBound, boolean isRows, List<RexExpression> constants, DataSchema resultSchema, DataSchema inputSchema,
+      Map<String, Function<DataSchema.ColumnDataType, AggregationUtils.Merger>> mergers, long requestId, int stageId,
+      VirtualServerAddress virtualServerAddress) {
+    super(requestId, stageId, virtualServerAddress);
+
+    boolean isPartitionByOnly = isPartitionByOnlyQuery(groupSet, orderSet, orderSetDirection, orderSetNullDirection);
+    // TODO: add support for ORDER BY in the OVER() clause
+    Preconditions.checkState(orderSet == null || orderSet.isEmpty() || isPartitionByOnly,
+        "Order by is not yet supported in window functions");
+
+    _inputOperator = inputOperator;
+    _groupSet = groupSet;
+    _orderSetInfo = new OrderSetInfo(orderSet, orderSetDirection, orderSetNullDirection);
+    _windowFrame = new WindowFrame(lowerBound, upperBound, isRows);
+
+    // TODO: add support for custom frames, and for ORDER BY default frame (upperBound => currentRow)
+    Preconditions.checkState(!_windowFrame.isRows(), "Only RANGE type frames are supported at present");
+    Preconditions.checkState(_windowFrame.isUnboundedPreceding(),
+        "Only default frame is supported, lowerBound must be UNBOUNDED PRECEDING");
+    Preconditions.checkState(_windowFrame.isUnboundedFollowing()
+            || (_windowFrame.isUpperBoundCurrentRow() && isPartitionByOnly),
+        "Only default frame is supported, upperBound must be UNBOUNDED FOLLOWING or CURRENT ROW");
+
+    // we expect all agg calls to be aggregate function calls
+    _aggCalls = aggCalls.stream().map(RexExpression.FunctionCall.class::cast).collect(Collectors.toList());
+    _constants = constants;
+    _resultSchema = resultSchema;
+
+    // TODO: Not all window functions (e.g. ROW_NUMBER, LAG, etc) need aggregations. Such functions should be handled
+    //       differently.
+    _windowAccumulators = new AggregationUtils.Accumulator[_aggCalls.size()];
+    for (int i = 0; i < _aggCalls.size(); i++) {
+      RexExpression.FunctionCall agg = _aggCalls.get(i);
+      String functionName = agg.getFunctionName();
+      if (!mergers.containsKey(functionName)) {
+        throw new IllegalStateException("Unexpected value: " + functionName);
+      }
+      _windowAccumulators[i] = new AggregationUtils.Accumulator(agg, mergers, functionName, inputSchema);
+    }
+
+    _partitionRows = new HashMap<>();
+
+    _numRows = 0;
+    _readyToConstruct = false;
+    _hasReturnedWindowAggregateBlock = false;
+  }
+
+  @Override
+  public List<MultiStageOperator> getChildOperators() {
+    return ImmutableList.of(_inputOperator);
+  }
+
+  @Nullable
+  @Override
+  public String toExplainString() {
+    return EXPLAIN_NAME;
+  }
+
+  @Override
+  protected TransferableBlock getNextBlock() {
+    try {
+      if (!_readyToConstruct && !consumeInputBlocks()) {
+        return TransferableBlockUtils.getNoOpTransferableBlock();
+      }
+
+      if (_upstreamErrorBlock != null) {
+        return _upstreamErrorBlock;
+      }
+
+      if (!_hasReturnedWindowAggregateBlock) {
+        return produceWindowAggregateBlock();
+      } else {
+        // TODO: Move to close call.
+        return TransferableBlockUtils.getEndOfStreamTransferableBlock();
+      }
+    } catch (Exception e) {
+      LOGGER.error("Caught exception while executing WindowAggregationOperator, returning an error block", e);
+      return TransferableBlockUtils.getErrorTransferableBlock(e);
+    }
+  }
+
+  private boolean isPartitionByOnlyQuery(List<RexExpression> groupSet, List<RexExpression> orderSet,
+      List<RelFieldCollation.Direction> orderSetDirection,
+      List<RelFieldCollation.NullDirection> orderSetNullDirection) {
+    if (CollectionUtils.isEmpty(orderSet)) {
+      return true;
+    }
+
+    if (CollectionUtils.isEmpty(groupSet) || (groupSet.size() != orderSet.size())) {

Review Comment:
   In Phase 1 (not specific to this PR), are we going to support queries where there are multiple window functions but one function has an ORDER BY and the other doesn't? I'm assuming it fails the criterion "All order by keys should be on a single column" and Calcite will generate 2 window groups. Is that correct?
   
   `SELECT SUM(col1) OVER(PARTITION BY key1), SUM(col2) OVER(PARTITION BY key1 ORDER BY col1) from table t1`
   
   Either way, if we don't have a test for this case, can we please add one?
   



##########
pinot-query-planner/src/main/java/org/apache/calcite/rel/rules/PinotWindowExchangeNodeInsertRule.java:
##########
@@ -160,6 +162,19 @@ private boolean isPartitionByOnlyQuery(Window.Group windowGroup) {
       Set<Integer> partitionByKeyList = new HashSet<>(windowGroup.keys.toList());
       Set<Integer> orderByKeyList = new HashSet<>(windowGroup.orderKeys.getKeys());
       isPartitionByOnly = partitionByKeyList.equals(orderByKeyList);
+      if (isPartitionByOnly) {

Review Comment:
   +1 to removing this because:
   1. We use isPartitionByOnlyQuery to decide if we have to use LogicalExchange vs LogicalSortExchange
   2. Based on my understanding, no sorting of rows within a partition is needed if the partition key and order by key are the same. 



##########
pinot-query-runtime/src/main/java/org/apache/pinot/query/runtime/operator/WindowAggregateOperator.java:
##########
@@ -0,0 +1,343 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.query.runtime.operator;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import javax.annotation.Nullable;
+import org.apache.calcite.rel.RelFieldCollation;
+import org.apache.commons.collections.CollectionUtils;
+import org.apache.pinot.common.datablock.DataBlock;
+import org.apache.pinot.common.utils.DataSchema;
+import org.apache.pinot.core.data.table.Key;
+import org.apache.pinot.query.planner.logical.RexExpression;
+import org.apache.pinot.query.routing.VirtualServerAddress;
+import org.apache.pinot.query.runtime.blocks.TransferableBlock;
+import org.apache.pinot.query.runtime.blocks.TransferableBlockUtils;
+import org.apache.pinot.query.runtime.operator.utils.AggregationUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/**
+ * The WindowAggregateOperator is used to compute window function aggregations over a set of optional
+ * PARTITION BY keys, ORDER BY keys and a FRAME clause. The output data will include the projected
+ * columns and in addition will add the aggregation columns to the output data.
+ * [input columns, aggregate result1, ... aggregate resultN]
+ *
+ * The window functions supported today are SUM/COUNT/MIN/MAX aggregations. Window functions also include

Review Comment:
   We also support AVG by virtue of SUM and COUNT, right?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org