You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@annotator.apache.org by ge...@apache.org on 2020/09/18 13:41:43 UTC

[incubator-annotator] 01/03: Change approach to choose prefix/suffix.

This is an automated email from the ASF dual-hosted git repository.

gerben pushed a commit to branch faster-describeTextQuote
in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git

commit 8823057780c8fbcb090a0633f43deee144da961d
Author: Gerben <ge...@treora.com>
AuthorDate: Fri Sep 18 12:55:32 2020 +0200

    Change approach to choose prefix/suffix.
    
    This makes it faster, but does not guarantee to find the minimum number
    of characters needed to disambiguate. I think it should be at most twice
    that minimum though.
    
    Note that the minimum is in any case not what one wants if the goal is
    to make the selector robust to small document changes. We should add a
    less stingy approach some day.
---
 packages/dom/src/text-quote/describe.ts | 71 +++++++++++----------------------
 1 file changed, 23 insertions(+), 48 deletions(-)

diff --git a/packages/dom/src/text-quote/describe.ts b/packages/dom/src/text-quote/describe.ts
index dca21c8..b599252 100644
--- a/packages/dom/src/text-quote/describe.ts
+++ b/packages/dom/src/text-quote/describe.ts
@@ -51,26 +51,24 @@ export async function describeTextQuote(
 function calculateContextForDisambiguation(
   range: Range,
   scope: Range,
-): { prefix?: string; suffix?: string } {
+): { prefix: string; suffix: string } {
   const exactText = range.toString();
   const scopeText = scope.toString();
   const targetStartIndex = getRangeTextPosition(range, scope);
   const targetEndIndex = targetStartIndex + exactText.length;
 
-  // Find all matches of the text in the scope.
-  const stringMatches: number[] = [];
+  // Starting with an empty prefix and suffix, we search for matches. At each unintended match
+  // we encounter, we extend the prefix or suffix just enough to ensure it will no longer match.
+  let prefix = '';
+  let suffix = '';
   let fromIndex = 0;
   while (fromIndex <= scopeText.length) {
-    const matchIndex = scopeText.indexOf(exactText, fromIndex);
-    if (matchIndex === -1) break;
-    stringMatches.push(matchIndex);
-    fromIndex = matchIndex + 1;
-  }
+    const searchPattern = prefix + exactText + suffix;
+    const patternMatchIndex = scopeText.indexOf(searchPattern, fromIndex);
+    if (patternMatchIndex === -1) break;
+    fromIndex = patternMatchIndex + 1;
 
-  // Count for each undesired match the required prefix and suffix lengths, such that either of them
-  // would have invalidated the match.
-  const affixLengthPairs: Array<[number, number]> = [];
-  for (const matchStartIndex of stringMatches) {
+    const matchStartIndex = patternMatchIndex + prefix.length;
     const matchEndIndex = matchStartIndex + exactText.length;
 
     // Skip the found match if it is the actual target.
@@ -87,20 +85,21 @@ function calculateContextForDisambiguation(
       scopeText.substring(matchEndIndex),
       false,
     );
-    affixLengthPairs.push([sufficientPrefixLength, sufficientSuffixLength]);
+
+    // Use either the prefix or suffix, whichever is shortest.
+    if (sufficientPrefixLength <= sufficientSuffixLength) {
+      prefix = scopeText.substring(
+        targetStartIndex - sufficientPrefixLength,
+        targetStartIndex,
+      );
+    } else {
+      suffix = scopeText.substring(
+        targetEndIndex,
+        targetEndIndex + sufficientSuffixLength,
+      );
+    }
   }
 
-  // Find the prefix and suffix that would invalidate all mismatches, using the minimal characters
-  // for prefix and suffix combined.
-  const [prefixLength, suffixLength] = minimalSolution(affixLengthPairs);
-  const prefix = scopeText.substring(
-    targetStartIndex - prefixLength,
-    targetStartIndex,
-  );
-  const suffix = scopeText.substring(
-    targetEndIndex,
-    targetEndIndex + suffixLength,
-  );
   return { prefix, suffix };
 }
 
@@ -123,30 +122,6 @@ function charactersNeededToBeUnique(
   else return overlap + 1;
 }
 
-function minimalSolution(
-  requirements: Array<[number, number]>,
-): [number, number] {
-  // Ensure we try solutions with an empty prefix or suffix.
-  requirements.push([0, 0]);
-
-  // Build all the pairs and order them by their sums.
-  const pairs = requirements.flatMap((l) =>
-    requirements.map<[number, number]>((r) => [l[0], r[1]]),
-  );
-  pairs.sort((a, b) => a[0] + a[1] - (b[0] + b[1]));
-
-  // Find the first pair that satisfies every requirement.
-  for (const pair of pairs) {
-    const [p0, p1] = pair;
-    if (requirements.every(([r0, r1]) => r0 <= p0 || r1 <= p1)) {
-      return pair;
-    }
-  }
-
-  // Return the largest pairing (unreachable).
-  return pairs[pairs.length - 1];
-}
-
 // Get the index of the first character of range within the text of scope.
 function getRangeTextPosition(range: Range, scope: Range): number {
   const iter = ownerDocument(scope).createNodeIterator(