You are viewing a plain text version of this content. The canonical link for it is here.
Posted to gitbox@hive.apache.org by GitBox <gi...@apache.org> on 2021/07/30 11:51:03 UTC

[GitHub] [hive] pgaref commented on a change in pull request #2225: HIVE-25061: PTF: Improve ValueBoundaryScanner

pgaref commented on a change in pull request #2225:
URL: https://github.com/apache/hive/pull/2225#discussion_r679833993



##########
File path: ql/src/java/org/apache/hadoop/hive/ql/udf/ptf/ValueBoundaryScanner.java
##########
@@ -403,26 +418,13 @@ protected int computeStartPreceding(int rowIdx, PTFPartition p) throws HiveExcep
       }
     }
 
-    Object rowVal = sortKey;
     int r = rowIdx;
 
-    // Use Case 4.
-    if ( expressionDef.getOrder() == Order.DESC ) {
-      while (r >= 0 && !isDistanceGreater(rowVal, sortKey, amt) ) {
-        Pair<Integer, Object> stepResult = skipOrStepBack(r, p);
-        r = stepResult.getLeft();
-        rowVal = stepResult.getRight();
-      }
-      return r + 1;
-    }
-    else { // Use Case 5.
-      while (r >= 0 && !isDistanceGreater(sortKey, rowVal, amt) ) {
-        Pair<Integer, Object> stepResult = skipOrStepBack(r, p);
-        r = stepResult.getLeft();
-        rowVal = stepResult.getRight();
-      }
-
-      return r + 1;
+    // Use Case 4,5
+    if (enableBinarySearch) {
+      return binarySearchBack(r, p, sortKey, amt, expressionDef.getOrder());

Review comment:
       I would pass rowId directly here for clarity

##########
File path: ql/src/java/org/apache/hadoop/hive/ql/udf/ptf/ValueBoundaryScanner.java
##########
@@ -664,6 +632,87 @@ public Object computeValue(Object row) throws HiveException {
    */
   public abstract boolean isEqual(Object v1, Object v2);
 
+  protected int binarySearchBack(int r, PTFPartition p, Object sortKey, int amt,
+      Order order) throws HiveException {
+    boolean isOrderDesc = order.equals(Order.DESC);
+    Object rowVal = sortKey;
+
+    int rMin = -1;  // tracks lowest possible number fulfilling the range requirement
+    int rMax = r; // tracks highest possible number fulfilling the range requirement
+
+    boolean isDistanceGreater = true;
+    while (true) {

Review comment:
       I usually avoid while true loops -- would refactor this to rMin < rMax and return rMin outside the loop

##########
File path: ql/src/java/org/apache/hadoop/hive/ql/udf/ptf/ValueBoundaryScanner.java
##########
@@ -664,6 +632,87 @@ public Object computeValue(Object row) throws HiveException {
    */
   public abstract boolean isEqual(Object v1, Object v2);
 
+  protected int binarySearchBack(int r, PTFPartition p, Object sortKey, int amt,
+      Order order) throws HiveException {
+    boolean isOrderDesc = order.equals(Order.DESC);
+    Object rowVal = sortKey;
+
+    int rMin = -1;  // tracks lowest possible number fulfilling the range requirement
+    int rMax = r; // tracks highest possible number fulfilling the range requirement
+
+    boolean isDistanceGreater = true;
+    while (true) {
+      if (rMin == rMax) {
+        return rMin;
+      }
+      isDistanceGreater =
+          isDistanceGreater(isOrderDesc ? rowVal : sortKey, isOrderDesc ? sortKey : rowVal, amt);
+      if (!isDistanceGreater && r == 0) {
+        return 0;
+      }
+      if (isDistanceGreater) {
+        rMin = r + 1;
+      } else {
+        rMax = r;
+      }
+      r = rMin + (rMax - rMin) / 2;
+      rowVal = computeValueUseCache(r, p);
+    }
+  }
+
+  private int linearSearchBack(int r, PTFPartition p, Object sortKey, int amt,
+      Order order) throws HiveException {
+    boolean isOrderDesc = order.equals(Order.DESC);
+    Object rowVal = sortKey;
+    while (r >= 0 && !isDistanceGreater(isOrderDesc ? rowVal : sortKey,
+        isOrderDesc ? sortKey : rowVal, amt)) {
+      Pair<Integer, Object> stepResult = skipOrStepBack(r, p);
+      r = stepResult.getLeft();
+      rowVal = stepResult.getRight();
+    }
+    return r + 1;
+  }
+
+  protected int binarySearchForward(int r, PTFPartition p, Object sortKey, int amt,
+      Order order) throws HiveException {
+    boolean isOrderDesc = order.equals(Order.DESC);
+    Object rowVal = sortKey;
+    int rMin = r;  // tracks lowest possible number fulfilling the range requirement
+    int rMax = p.size(); // tracks highest possible number fulfilling the range requirement

Review comment:
       maybe rMax = p.size()-1 ?

##########
File path: ql/src/java/org/apache/hadoop/hive/ql/udf/ptf/ValueBoundaryScanner.java
##########
@@ -664,6 +632,87 @@ public Object computeValue(Object row) throws HiveException {
    */
   public abstract boolean isEqual(Object v1, Object v2);
 
+  protected int binarySearchBack(int r, PTFPartition p, Object sortKey, int amt,
+      Order order) throws HiveException {
+    boolean isOrderDesc = order.equals(Order.DESC);
+    Object rowVal = sortKey;
+
+    int rMin = -1;  // tracks lowest possible number fulfilling the range requirement

Review comment:
       maybe start with rMin=0?

##########
File path: ql/src/java/org/apache/hadoop/hive/ql/udf/ptf/ValueBoundaryScanner.java
##########
@@ -664,6 +632,87 @@ public Object computeValue(Object row) throws HiveException {
    */
   public abstract boolean isEqual(Object v1, Object v2);
 
+  protected int binarySearchBack(int r, PTFPartition p, Object sortKey, int amt,

Review comment:
       maybe rename r to something like mid or rowId?

##########
File path: ql/src/java/org/apache/hadoop/hive/ql/udf/ptf/ValueBoundaryScanner.java
##########
@@ -664,6 +632,87 @@ public Object computeValue(Object row) throws HiveException {
    */
   public abstract boolean isEqual(Object v1, Object v2);
 
+  protected int binarySearchBack(int r, PTFPartition p, Object sortKey, int amt,
+      Order order) throws HiveException {
+    boolean isOrderDesc = order.equals(Order.DESC);
+    Object rowVal = sortKey;
+
+    int rMin = -1;  // tracks lowest possible number fulfilling the range requirement
+    int rMax = r; // tracks highest possible number fulfilling the range requirement
+
+    boolean isDistanceGreater = true;
+    while (true) {
+      if (rMin == rMax) {
+        return rMin;
+      }
+      isDistanceGreater =
+          isDistanceGreater(isOrderDesc ? rowVal : sortKey, isOrderDesc ? sortKey : rowVal, amt);
+      if (!isDistanceGreater && r == 0) {

Review comment:
       Looks to me that is !isDistanceGreater and r/mid ==0 -> rMax will become rMin and will terminate in next iteration.
   Any particular reason we add this here?

##########
File path: ql/src/java/org/apache/hadoop/hive/ql/udf/ptf/ValueBoundaryScanner.java
##########
@@ -664,6 +632,87 @@ public Object computeValue(Object row) throws HiveException {
    */
   public abstract boolean isEqual(Object v1, Object v2);
 
+  protected int binarySearchBack(int r, PTFPartition p, Object sortKey, int amt,
+      Order order) throws HiveException {
+    boolean isOrderDesc = order.equals(Order.DESC);
+    Object rowVal = sortKey;
+
+    int rMin = -1;  // tracks lowest possible number fulfilling the range requirement
+    int rMax = r; // tracks highest possible number fulfilling the range requirement
+
+    boolean isDistanceGreater = true;
+    while (true) {
+      if (rMin == rMax) {
+        return rMin;
+      }
+      isDistanceGreater =
+          isDistanceGreater(isOrderDesc ? rowVal : sortKey, isOrderDesc ? sortKey : rowVal, amt);
+      if (!isDistanceGreater && r == 0) {
+        return 0;
+      }
+      if (isDistanceGreater) {
+        rMin = r + 1;
+      } else {
+        rMax = r;
+      }
+      r = rMin + (rMax - rMin) / 2;
+      rowVal = computeValueUseCache(r, p);
+    }
+  }
+
+  private int linearSearchBack(int r, PTFPartition p, Object sortKey, int amt,
+      Order order) throws HiveException {
+    boolean isOrderDesc = order.equals(Order.DESC);
+    Object rowVal = sortKey;
+    while (r >= 0 && !isDistanceGreater(isOrderDesc ? rowVal : sortKey,
+        isOrderDesc ? sortKey : rowVal, amt)) {
+      Pair<Integer, Object> stepResult = skipOrStepBack(r, p);
+      r = stepResult.getLeft();
+      rowVal = stepResult.getRight();
+    }
+    return r + 1;
+  }
+
+  protected int binarySearchForward(int r, PTFPartition p, Object sortKey, int amt,
+      Order order) throws HiveException {
+    boolean isOrderDesc = order.equals(Order.DESC);
+    Object rowVal = sortKey;
+    int rMin = r;  // tracks lowest possible number fulfilling the range requirement
+    int rMax = p.size(); // tracks highest possible number fulfilling the range requirement
+
+    boolean isDistanceGreater = true;
+    while (true) {
+      if (rMin == rMax) {
+        return rMin;
+      }
+      isDistanceGreater =
+          isDistanceGreater(isOrderDesc ? sortKey : rowVal, isOrderDesc ? rowVal : sortKey, amt);
+      if (!isDistanceGreater && r == p.size() - 1) {

Review comment:
       Just wondering -- p.size() is out of bounds right?

##########
File path: ql/src/java/org/apache/hadoop/hive/ql/udf/ptf/ValueBoundaryScanner.java
##########
@@ -664,6 +632,87 @@ public Object computeValue(Object row) throws HiveException {
    */
   public abstract boolean isEqual(Object v1, Object v2);
 
+  protected int binarySearchBack(int r, PTFPartition p, Object sortKey, int amt,
+      Order order) throws HiveException {
+    boolean isOrderDesc = order.equals(Order.DESC);
+    Object rowVal = sortKey;
+
+    int rMin = -1;  // tracks lowest possible number fulfilling the range requirement
+    int rMax = r; // tracks highest possible number fulfilling the range requirement
+
+    boolean isDistanceGreater = true;
+    while (true) {
+      if (rMin == rMax) {
+        return rMin;
+      }
+      isDistanceGreater =
+          isDistanceGreater(isOrderDesc ? rowVal : sortKey, isOrderDesc ? sortKey : rowVal, amt);
+      if (!isDistanceGreater && r == 0) {
+        return 0;
+      }
+      if (isDistanceGreater) {
+        rMin = r + 1;
+      } else {
+        rMax = r;
+      }
+      r = rMin + (rMax - rMin) / 2;

Review comment:
       lets rename r to mid -- since its BS 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: gitbox-unsubscribe@hive.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: gitbox-unsubscribe@hive.apache.org
For additional commands, e-mail: gitbox-help@hive.apache.org