You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@annotator.apache.org by ge...@apache.org on 2020/09/18 13:41:42 UTC

[incubator-annotator] branch faster-describeTextQuote created (now 486c59f)

This is an automated email from the ASF dual-hosted git repository.

gerben pushed a change to branch faster-describeTextQuote
in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git.


      at 486c59f  Add test, fix bug

This branch includes the following new commits:

     new 8823057  Change approach to choose prefix/suffix.
     new 35d3ebe  More performance tweaking
     new 486c59f  Add test, fix bug

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[incubator-annotator] 01/03: Change approach to choose prefix/suffix.

Posted by ge...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

gerben pushed a commit to branch faster-describeTextQuote
in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git

commit 8823057780c8fbcb090a0633f43deee144da961d
Author: Gerben <ge...@treora.com>
AuthorDate: Fri Sep 18 12:55:32 2020 +0200

    Change approach to choose prefix/suffix.
    
    This makes it faster, but does not guarantee to find the minimum number
    of characters needed to disambiguate. I think it should be at most twice
    that minimum though.
    
    Note that the minimum is in any case not what one wants if the goal is
    to make the selector robust to small document changes. We should add a
    less stingy approach some day.
---
 packages/dom/src/text-quote/describe.ts | 71 +++++++++++----------------------
 1 file changed, 23 insertions(+), 48 deletions(-)

diff --git a/packages/dom/src/text-quote/describe.ts b/packages/dom/src/text-quote/describe.ts
index dca21c8..b599252 100644
--- a/packages/dom/src/text-quote/describe.ts
+++ b/packages/dom/src/text-quote/describe.ts
@@ -51,26 +51,24 @@ export async function describeTextQuote(
 function calculateContextForDisambiguation(
   range: Range,
   scope: Range,
-): { prefix?: string; suffix?: string } {
+): { prefix: string; suffix: string } {
   const exactText = range.toString();
   const scopeText = scope.toString();
   const targetStartIndex = getRangeTextPosition(range, scope);
   const targetEndIndex = targetStartIndex + exactText.length;
 
-  // Find all matches of the text in the scope.
-  const stringMatches: number[] = [];
+  // Starting with an empty prefix and suffix, we search for matches. At each unintended match
+  // we encounter, we extend the prefix or suffix just enough to ensure it will no longer match.
+  let prefix = '';
+  let suffix = '';
   let fromIndex = 0;
   while (fromIndex <= scopeText.length) {
-    const matchIndex = scopeText.indexOf(exactText, fromIndex);
-    if (matchIndex === -1) break;
-    stringMatches.push(matchIndex);
-    fromIndex = matchIndex + 1;
-  }
+    const searchPattern = prefix + exactText + suffix;
+    const patternMatchIndex = scopeText.indexOf(searchPattern, fromIndex);
+    if (patternMatchIndex === -1) break;
+    fromIndex = patternMatchIndex + 1;
 
-  // Count for each undesired match the required prefix and suffix lengths, such that either of them
-  // would have invalidated the match.
-  const affixLengthPairs: Array<[number, number]> = [];
-  for (const matchStartIndex of stringMatches) {
+    const matchStartIndex = patternMatchIndex + prefix.length;
     const matchEndIndex = matchStartIndex + exactText.length;
 
     // Skip the found match if it is the actual target.
@@ -87,20 +85,21 @@ function calculateContextForDisambiguation(
       scopeText.substring(matchEndIndex),
       false,
     );
-    affixLengthPairs.push([sufficientPrefixLength, sufficientSuffixLength]);
+
+    // Use either the prefix or suffix, whichever is shortest.
+    if (sufficientPrefixLength <= sufficientSuffixLength) {
+      prefix = scopeText.substring(
+        targetStartIndex - sufficientPrefixLength,
+        targetStartIndex,
+      );
+    } else {
+      suffix = scopeText.substring(
+        targetEndIndex,
+        targetEndIndex + sufficientSuffixLength,
+      );
+    }
   }
 
-  // Find the prefix and suffix that would invalidate all mismatches, using the minimal characters
-  // for prefix and suffix combined.
-  const [prefixLength, suffixLength] = minimalSolution(affixLengthPairs);
-  const prefix = scopeText.substring(
-    targetStartIndex - prefixLength,
-    targetStartIndex,
-  );
-  const suffix = scopeText.substring(
-    targetEndIndex,
-    targetEndIndex + suffixLength,
-  );
   return { prefix, suffix };
 }
 
@@ -123,30 +122,6 @@ function charactersNeededToBeUnique(
   else return overlap + 1;
 }
 
-function minimalSolution(
-  requirements: Array<[number, number]>,
-): [number, number] {
-  // Ensure we try solutions with an empty prefix or suffix.
-  requirements.push([0, 0]);
-
-  // Build all the pairs and order them by their sums.
-  const pairs = requirements.flatMap((l) =>
-    requirements.map<[number, number]>((r) => [l[0], r[1]]),
-  );
-  pairs.sort((a, b) => a[0] + a[1] - (b[0] + b[1]));
-
-  // Find the first pair that satisfies every requirement.
-  for (const pair of pairs) {
-    const [p0, p1] = pair;
-    if (requirements.every(([r0, r1]) => r0 <= p0 || r1 <= p1)) {
-      return pair;
-    }
-  }
-
-  // Return the largest pairing (unreachable).
-  return pairs[pairs.length - 1];
-}
-
 // Get the index of the first character of range within the text of scope.
 function getRangeTextPosition(range: Range, scope: Range): number {
   const iter = ownerDocument(scope).createNodeIterator(


[incubator-annotator] 02/03: More performance tweaking

Posted by ge...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

gerben pushed a commit to branch faster-describeTextQuote
in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git

commit 35d3ebe00083d53570a829638a6dd489f6e7ce5e
Author: Gerben <ge...@treora.com>
AuthorDate: Fri Sep 18 14:45:54 2020 +0200

    More performance tweaking
    
    I suppose substring()ing the whole scope may be a performance eater if
    the interpreter copies the string every time.
---
 packages/dom/src/text-quote/describe.ts | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/packages/dom/src/text-quote/describe.ts b/packages/dom/src/text-quote/describe.ts
index b599252..93cbe53 100644
--- a/packages/dom/src/text-quote/describe.ts
+++ b/packages/dom/src/text-quote/describe.ts
@@ -76,14 +76,18 @@ function calculateContextForDisambiguation(
 
     // Count how many characters before & after them the false match and target have in common.
     const sufficientPrefixLength = charactersNeededToBeUnique(
-      scopeText.substring(0, targetStartIndex),
-      scopeText.substring(0, matchStartIndex),
+      scopeText,
+      targetStartIndex,
+      matchStartIndex,
       true,
+      prefix.length,
     );
     const sufficientSuffixLength = charactersNeededToBeUnique(
-      scopeText.substring(targetEndIndex),
-      scopeText.substring(matchEndIndex),
+      scopeText,
+      targetEndIndex,
+      matchEndIndex,
       false,
+      suffix.length,
     );
 
     // Use either the prefix or suffix, whichever is shortest.
@@ -104,21 +108,16 @@ function calculateContextForDisambiguation(
 }
 
 function charactersNeededToBeUnique(
-  target: string,
-  impostor: string,
+  text: string,
+  target: number,
+  impostor: number,
   reverse = false,
-) {
-  // Count how many characters the two strings have in common.
-  let overlap = 0;
-  const charAt = (s: string, i: number) =>
-    reverse ? s[s.length - 1 - i] : s[overlap];
-  while (
-    overlap < target.length &&
-    charAt(target, overlap) === charAt(impostor, overlap)
-  )
+  overlap = 0,
+): number {
+  const nextChar = (offset: number) => reverse ? text[offset - 1 - overlap] : text[offset + overlap];
+  while (nextChar(target) && nextChar(target) === nextChar(impostor))
     overlap++;
-  if (overlap === target.length) return Infinity;
-  // (no substring of target can make it distinguishable from its impostor)
+  if (!nextChar(target)) return Infinity; // end/start of string reached.
   else return overlap + 1;
 }
 


[incubator-annotator] 03/03: Add test, fix bug

Posted by ge...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

gerben pushed a commit to branch faster-describeTextQuote
in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git

commit 486c59f60b9125e87f250c8b7a725242cc942b0d
Author: Gerben <ge...@treora.com>
AuthorDate: Fri Sep 18 15:29:59 2020 +0200

    Add test, fix bug
---
 packages/dom/src/text-quote/describe.ts        |  2 ++
 packages/dom/test/text-quote/describe-cases.ts | 15 +++++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/packages/dom/src/text-quote/describe.ts b/packages/dom/src/text-quote/describe.ts
index 93cbe53..50a5653 100644
--- a/packages/dom/src/text-quote/describe.ts
+++ b/packages/dom/src/text-quote/describe.ts
@@ -92,6 +92,8 @@ function calculateContextForDisambiguation(
 
     // Use either the prefix or suffix, whichever is shortest.
     if (sufficientPrefixLength <= sufficientSuffixLength) {
+      // Compensate our search position for the increase in prefix length.
+      fromIndex -= sufficientPrefixLength - prefix.length;
       prefix = scopeText.substring(
         targetStartIndex - sufficientPrefixLength,
         targetStartIndex,
diff --git a/packages/dom/test/text-quote/describe-cases.ts b/packages/dom/test/text-quote/describe-cases.ts
index 7d43a5f..142437d 100644
--- a/packages/dom/test/text-quote/describe-cases.ts
+++ b/packages/dom/test/text-quote/describe-cases.ts
@@ -104,6 +104,21 @@ export const testCases: {
       suffix: '',
     },
   },
+  'multiple, overlapping false matches': {
+    html: '<b>aaaaaaaaaa</b>',
+    range: {
+      startContainerXPath: '//b/text()',
+      startOffset: 4,
+      endContainerXPath: '//b/text()',
+      endOffset: 7,
+    },
+    expected: {
+      type: 'TextQuoteSelector',
+      exact: 'aaa',
+      prefix: 'aaaa',
+      suffix: 'aaa',
+    },
+  },
   'empty quote': {
     html: '<b>To annotate or not to annotate</b>',
     range: {