You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pinot.apache.org by GitBox <gi...@apache.org> on 2021/05/28 04:27:57 UTC

[GitHub] [incubator-pinot] mqliang commented on a change in pull request #6969: Fix partial comparison for realtime table and allow keep cluster on failure

mqliang commented on a change in pull request #6969:
URL: https://github.com/apache/incubator-pinot/pull/6969#discussion_r641259035



##########
File path: pinot-integration-tests/src/test/java/org/apache/pinot/compat/tests/SqlResultComparator.java
##########
@@ -113,9 +126,147 @@ public static boolean areEqual(JsonNode actual, JsonNode expected, String query)
     if (expected.has(FIELD_IS_SUPERSET) && expected.get(FIELD_IS_SUPERSET).asBoolean(false)) {
       return areElementsSubset(actualElementsSerialized, expectedElementsSerialized);
     } else {
-      return areLengthsEqual(actual, expected) && areElementsEqual(actualElementsSerialized, expectedElementsSerialized,
-          query) && areMetadataEqual(actual, expected);
+      if (!areLengthsEqual(actual, expected)) {
+        return false;
+      }
+      /*
+       * Pinot server do some early termination optimization (process in parallel and early return if get enough
+       * documents to fulfill the LIMIT and OFFSET requirement) for queries:
+       * - selection without order by
+       * - selection with order by (by sorting the segments on min-max value)
+       * - DISTINCT queries.
+       * numDocsScanned is no-deterministic for those queries so numDocsScanned comparison should be skipped.
+       *

Review comment:
       done

##########
File path: pinot-integration-tests/src/test/java/org/apache/pinot/compat/tests/SqlResultComparator.java
##########
@@ -113,9 +126,147 @@ public static boolean areEqual(JsonNode actual, JsonNode expected, String query)
     if (expected.has(FIELD_IS_SUPERSET) && expected.get(FIELD_IS_SUPERSET).asBoolean(false)) {
       return areElementsSubset(actualElementsSerialized, expectedElementsSerialized);
     } else {
-      return areLengthsEqual(actual, expected) && areElementsEqual(actualElementsSerialized, expectedElementsSerialized,
-          query) && areMetadataEqual(actual, expected);
+      if (!areLengthsEqual(actual, expected)) {
+        return false;
+      }
+      /*
+       * Pinot server do some early termination optimization (process in parallel and early return if get enough
+       * documents to fulfill the LIMIT and OFFSET requirement) for queries:
+       * - selection without order by
+       * - selection with order by (by sorting the segments on min-max value)
+       * - DISTINCT queries.
+       * numDocsScanned is no-deterministic for those queries so numDocsScanned comparison should be skipped.
+       *
+       * NOTE: DISTINCT queries are modeled as non-selection queries during query processing, but all DISTINCT
+       * queries are selection queries during Calcite parsing (DISTINCT queries are selection queries with
+       * selectNode.getModifierNode(SqlSelectKeyword.DISTINCT) != null).
+       */
+      if (!isSelectionQuery(query) && !areNumDocsScannedEqual(actual, expected)) {
+        return false;
+      }
+      return isOrderByQuery(query) ? areOrderByQueryElementsEqual(actualRows, expectedRows, actualElementsSerialized,
+          expectedElementsSerialized, query)
+          : areNonOrderByQueryElementsEqual(actualElementsSerialized, expectedElementsSerialized);
+    }
+  }
+
+  private static boolean areOrderByQueryElementsEqual(ArrayNode actualElements, ArrayNode expectedElements,
+      List<String> actualElementsSerialized, List<String> expectedElementsSerialized, String query) {
+    // Happy path, the results match exactly.
+    if (actualElementsSerialized.equals(expectedElementsSerialized)) {
+      LOGGER.debug("The results of the ordered query match exactly!");
+      return true;
+    }
+
+    /*
+     * Unhappy path, it possible that the returned results:
+     * - are not ordered by all columns.
+     * - to be a subset of total qualified results.
+     * In this case, we divide the results into groups (based on the ordered by column values), then compare the actual
+     * results and expected results group by group:
+     * - ordered by column values should be the same.
+     * - other column values (columns not in the order-by list) should be in the same set.
+     * - for the last group, since it's possible that the returned results to be a subset of total qualified results,
+     *   skipping the value comparison for other columns.
+     *
+     * Let's say we have a table:
+     * column_name: A, B, C
+     * row 0 value: 1, 2, 3
+     * row 1 value: 2, 2, 4
+     * row 2 value: 3, 7, 5
+     * row 3 value: 3, 2, 3
+     * row 4 value: 4, 2, 5
+     * row 5 value: 4, 7, 6
+     *
+     * There are 4 possible result for query `SELECT * from table ordered by A LIMIT 5`:
+     * 1) [row 0, row 1, row 2, row 3, row 4]
+     * 2) [row 0, row 1, row 3, row 2, row 4]
+     * 3) [row 0, row 1, row 2, row 3, row 5]
+     * 4) [row 0, row 1, row 3, row 2, row 5]
+     *
+     * So we will divide the result into 4 groups (based on value of A):
+     * group 1: [row 0]
+     * group 2: [row 1]
+     * group 3: [row 2, row 3], or [row 3, row 2]
+     * group 4: [row 4] or [row 5]
+     *
+     * During comparison:
+     * - for group 1, 2, 3: value of A should be the same, value of (B, C) should be in the same set.
+     * - for group 4: only verify the value of A should be the same.
+     */
+
+    List<Integer> orderByColumnIndexs = getOrderByColumnIndexs(query);
+    LinkedHashMap<String, List<String>> actualOrderByColumnValuesToOtherColumnValuesMap = new LinkedHashMap<>();
+    LinkedHashMap<String, List<String>> expectedOrderByColumnValuesToOtherColumnValuesMap = new LinkedHashMap<>();
+    String lastGroupOrderByColumnValues = "";
+    for (int i = 0; i < actualElements.size(); i++) {
+      String actualOrderByColumnValues = "";
+      String expectedOrderByColumnValues = "";
+      String actualOtherColumnValues = "";
+      String expectOtherColumnValues = "";
+      ArrayNode actualValue = (ArrayNode) actualElements.get(i).get(FIELD_VALUE);
+      ArrayNode expectedValue = (ArrayNode) expectedElements.get(i).get(FIELD_VALUE);
+
+      for (int j = 0; j < actualValue.size(); j++) {
+        if (orderByColumnIndexs.contains(j)) {
+          actualOrderByColumnValues += ", " + actualValue.get(j).toString();
+          expectedOrderByColumnValues += ", " + expectedValue.get(j).toString();
+        } else {
+          actualOtherColumnValues += ", " + actualValue.get(j).toString();
+          expectOtherColumnValues += ", " + expectedValue.get(j).toString();
+        }
+      }
+      lastGroupOrderByColumnValues = actualOrderByColumnValues;
+
+      actualOrderByColumnValuesToOtherColumnValuesMap.
+          computeIfAbsent(actualOrderByColumnValues, k -> new LinkedList<>()).
+          add(actualOtherColumnValues);
+      expectedOrderByColumnValuesToOtherColumnValuesMap.
+          computeIfAbsent(expectedOrderByColumnValues, k -> new LinkedList<>()).
+          add(expectOtherColumnValues);
+    }
+
+    if (!actualOrderByColumnValuesToOtherColumnValuesMap.keySet().
+        equals(expectedOrderByColumnValuesToOtherColumnValuesMap.keySet())) {
+      LOGGER.error("The results of the ordered query has different groups, actual: {}, expected: {}",
+          actualOrderByColumnValuesToOtherColumnValuesMap.keySet(),
+          expectedOrderByColumnValuesToOtherColumnValuesMap.keySet());
+      return false;
+    }
+
+    for (Map.Entry<String, List<String>> entry : actualOrderByColumnValuesToOtherColumnValuesMap.entrySet()) {
+      String orderByColumnValues = entry.getKey();
+      // For the last group, skip the value comparison for other columns.
+      if (orderByColumnValues.equals(lastGroupOrderByColumnValues)) {
+        continue;
+      }
+      List<String> actualOtherColumnValues = entry.getValue();
+      List<String> expectedOtherColumnValues =
+          expectedOrderByColumnValuesToOtherColumnValuesMap.get(orderByColumnValues);
+      Collections.sort(actualOtherColumnValues);
+      Collections.sort(expectedOtherColumnValues);
+      if (!actualOtherColumnValues.equals(expectedOtherColumnValues)) {
+        LOGGER.error(
+            "The results of the ordered query has different non-order-by column values for group: {}, actual: {}, expected: {}",
+            orderByColumnValues, actualOtherColumnValues, expectedOtherColumnValues);
+        return false;
+      }
+    }
+
+    return true;
+  }
+
+  private static boolean areNonOrderByQueryElementsEqual(List<String> actualElementsSerialized,
+      List<String> expectedElementsSerialized) {
+    // sort elements
+    actualElementsSerialized.sort(null);
+    expectedElementsSerialized.sort(null);
+    if (!actualElementsSerialized.equals(expectedElementsSerialized)) {

Review comment:
       done

##########
File path: compatibility-verifier/README.md
##########
@@ -32,7 +32,13 @@ dir and output the path, which can be used in step 2.
 
 ### Step 2: run compatibility regression test against the two targets build in step1
 ```shell
-./compatibility-verifier/compCheck.sh [workingDir]
+./compCheck.sh -h
+Usage:  -w <workingDir> -t <testSuiteDir> -k true/false

Review comment:
       done

##########
File path: compatibility-verifier/compCheck.sh
##########
@@ -41,20 +41,54 @@
 
 RM="/bin/rm"
 logCount=1
+#Declare the number of mandatory args
+margs=2
 
 # get usage of the script
 function usage() {
   command=$1
-  echo "Usage: $command workingDir testSuiteDir"
-  exit 1
+  echo "Usage: $command -w <workingDir> -t <testSuiteDir> -k true/false"
+}
+
+function help() {
+  usage
+  echo -e "MANDATORY:"
+  echo -e "  -w, --working-dir                      Working directory where olderCommit and newCommit target files reside."
+  echo -e "  -t, --test-suite-dir                   Test suite directory\n"
+  echo -e "OPTIONAL:"
+  echo -e "  -k, --keep-cluster-on-failure          Whether keep cluster on test failure (default: false)"

Review comment:
       done




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org