You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@annotator.apache.org by ge...@apache.org on 2020/11/05 21:44:36 UTC

[incubator-annotator] 03/03: Make text quote search chunk by chunk

This is an automated email from the ASF dual-hosted git repository.

gerben pushed a commit to branch import-dom-seek
in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git

commit 0b3a9f6965bc4d3f2afbf666c0d44413a67d8663
Author: Gerben <ge...@treora.com>
AuthorDate: Thu Nov 5 16:22:45 2020 +0100

    Make text quote search chunk by chunk
    
    This allows a document to be provided piecemeal.
    
    The behaviour changes slightly for matches that end at the end of a
    node/chunk: we no longer put the match at the start of the next node,
    in order to avoid needlessly asking the document for new chunks.
---
 packages/dom/src/text-quote/match.ts        | 133 +++++++++++++++++++++-------
 packages/dom/test/text-quote/match-cases.ts |   8 +-
 packages/dom/test/text-quote/match.test.ts  |   8 +-
 3 files changed, 108 insertions(+), 41 deletions(-)

diff --git a/packages/dom/src/text-quote/match.ts b/packages/dom/src/text-quote/match.ts
index 0b3d3af..d18fbf8 100644
--- a/packages/dom/src/text-quote/match.ts
+++ b/packages/dom/src/text-quote/match.ts
@@ -19,57 +19,124 @@
  */
 
 import type { Matcher, TextQuoteSelector } from '@annotator/selector';
-import { ownerDocument } from '../owner-document';
-import { DomSeeker } from '../seek';
+import { TextNodeChunker, Chunk, Chunker } from '../chunker';
+
+export interface ChunkRange<TChunk extends Chunk<any>> {
+  startChunk: TChunk;
+  startIndex: number;
+  endChunk: TChunk;
+  endIndex: number;
+}
 
 export function createTextQuoteSelectorMatcher(
   selector: TextQuoteSelector,
 ): Matcher<Range, Range> {
+  const abstractMatcher = abstractTextQuoteSelectorMatcher(selector);
   return async function* matchAll(scope) {
-    const document = ownerDocument(scope);
-    const scopeText = scope.toString();
+    const textChunks = new TextNodeChunker(scope);
+
+    for await (const abstractMatch of abstractMatcher(textChunks)) {
+      const match = document.createRange();
+      // The `+…startOffset` parts are only relevant for the first chunk, as it
+      // might start within a text node.
+      match.setStart(abstractMatch.startChunk.node,
+        abstractMatch.startIndex + abstractMatch.startChunk.startOffset);
+      match.setEnd(abstractMatch.endChunk.node,
+        abstractMatch.endIndex + abstractMatch.endChunk.startOffset);
+      yield match;
+    }
+  }
+}
 
+type AbstractMatcher<TChunk extends Chunk<string>> =
+  Matcher<Chunker<TChunk>, ChunkRange<TChunk>>
+
+export function abstractTextQuoteSelectorMatcher(
+  selector: TextQuoteSelector,
+): AbstractMatcher<any> {
+  return async function* matchAll<TChunk extends Chunk<string>>(textChunks: Chunker<TChunk>) {
     const exact = selector.exact;
     const prefix = selector.prefix || '';
     const suffix = selector.suffix || '';
     const searchPattern = prefix + exact + suffix;
 
-    let seeker: DomSeeker;
-    try {
-      seeker = new DomSeeker(scope);
-    } catch (error) {
-      // If the scope does not contain text nodes, we can stop. (if it contains
-      // only empty text nodes we continue: it would still match an empty quote)
-      if (error instanceof RangeError) return;
-      else throw error;
-    }
+    let partialMatches: Array<{
+      startChunk: TChunk;
+      startIndex: number;
+      charactersMatched: number;
+    }> = [];
 
-    let fromIndex = 0;
-    while (fromIndex <= scopeText.length) {
-      // Find the quote with its prefix and suffix in the string.
-      const patternStartIndex = scopeText.indexOf(searchPattern, fromIndex);
-      if (patternStartIndex === -1) return;
+    let chunk: TChunk | null;
+    while (chunk = textChunks.currentChunk) {
+      const chunkValue = chunk.data;
 
-      // Correct for the prefix and suffix lengths.
-      const matchStartIndex = patternStartIndex + prefix.length;
-      const matchEndIndex = matchStartIndex + exact.length;
+      // Continue checking any partial matches from the previous chunk(s).
+      const remainingPartialMatches: typeof partialMatches = [];
+      for (const { startChunk, startIndex, charactersMatched } of partialMatches) {
+        if (searchPattern.length - charactersMatched > chunkValue.length) {
+          if (chunkValue === searchPattern.substring(charactersMatched, charactersMatched + chunkValue.length)) {
+            // The chunk is too short to complete the match; comparison has to be completed in subsequent chunks.
+            remainingPartialMatches.push({
+              startChunk,
+              startIndex,
+              charactersMatched: charactersMatched + chunkValue.length,
+            });
+          }
+        }
+        else if (chunkValue.startsWith(searchPattern.substring(charactersMatched))) {
+          yield {
+            startChunk,
+            startIndex,
+            endChunk: chunk,
+            endIndex: searchPattern.length - charactersMatched,
+          };
+        }
+      }
+      partialMatches = remainingPartialMatches;
 
-      // Create a range to represent this exact quote in the dom.
-      const match = document.createRange();
+      // Try find the whole pattern in the chunk (possibly multiple times).
+      if (searchPattern.length <= chunkValue.length) {
+        let fromIndex = 0;
+        while (fromIndex <= chunkValue.length) {
+          const patternStartIndex = chunkValue.indexOf(searchPattern, fromIndex);
+          if (patternStartIndex === -1) break;
 
-      // Seek to the start of the match, make the range start there.
-      seeker.seekTo(matchStartIndex);
-      match.setStart(seeker.referenceNode, seeker.offsetInReferenceNode);
+          // Correct for the prefix and suffix lengths.
+          const matchStartIndex = patternStartIndex + prefix.length;
+          const matchEndIndex = matchStartIndex + exact.length;
 
-      // Seek to the end of the match, make the range end there.
-      seeker.seekTo(matchEndIndex);
-      match.setEnd(seeker.referenceNode, seeker.offsetInReferenceNode);
+          yield {
+            startChunk: chunk,
+            startIndex: matchStartIndex,
+            endChunk: chunk,
+            endIndex: matchEndIndex,
+          };
 
-      // Yield the match.
-      yield match;
+          // Advance the search forward to detect multiple occurrences within the same chunk.
+          fromIndex = matchStartIndex + 1;
+        }
+      }
+
+      // Check if this chunk ends with a partial match (or even multiple partial matches).
+      let newPartialMatches: number[] = [];
+      const searchStartPoint = Math.max(chunkValue.length - searchPattern.length + 1, 0);
+      for (let i = searchStartPoint; i < chunkValue.length; i++) {
+        const character = chunkValue[i];
+        newPartialMatches = newPartialMatches.filter(
+          partialMatchStartIndex => (character === searchPattern[i - partialMatchStartIndex])
+        );
+        if (character === searchPattern[0]) newPartialMatches.push(i);
+      }
+      for (const partialMatchStartIndex of newPartialMatches) {
+        partialMatches.push({
+          startChunk: chunk,
+          startIndex: partialMatchStartIndex,
+          charactersMatched: chunkValue.length - partialMatchStartIndex,
+        });
+      }
 
-      // Advance the search forward to detect multiple occurrences.
-      fromIndex = matchStartIndex + 1;
+      if (textChunks.nextChunk() === null)
+        break;
     }
   };
 }
diff --git a/packages/dom/test/text-quote/match-cases.ts b/packages/dom/test/text-quote/match-cases.ts
index 0a4bb0e..dc4d96c 100644
--- a/packages/dom/test/text-quote/match-cases.ts
+++ b/packages/dom/test/text-quote/match-cases.ts
@@ -98,8 +98,8 @@ export const testCases: {
       {
         startContainerXPath: '//i/text()',
         startOffset: 0,
-        endContainerXPath: '//b/text()[2]',
-        endOffset: 0,
+        endContainerXPath: '//i/text()',
+        endOffset: 11,
       },
     ],
   },
@@ -114,8 +114,8 @@ export const testCases: {
       {
         startContainerXPath: '//title/text()',
         startOffset: 4,
-        endContainerXPath: '//b/text()[1]',
-        endOffset: 0,
+        endContainerXPath: '//title/text()',
+        endOffset: 9,
       },
     ],
   },
diff --git a/packages/dom/test/text-quote/match.test.ts b/packages/dom/test/text-quote/match.test.ts
index 276bb5d..6baf2e3 100644
--- a/packages/dom/test/text-quote/match.test.ts
+++ b/packages/dom/test/text-quote/match.test.ts
@@ -59,8 +59,8 @@ describe('createTextQuoteSelectorMatcher', () => {
       {
         startContainerXPath: '//b/text()[13]',
         startOffset: 0,
-        endContainerXPath: '//b/text()[21]',
-        endOffset: 0,
+        endContainerXPath: '//b/text()[20]',
+        endOffset: 1,
       },
     ]);
   });
@@ -88,8 +88,8 @@ describe('createTextQuoteSelectorMatcher', () => {
       {
         startContainerXPath: '//b/text()[4]', // "dolor"
         startOffset: 0,
-        endContainerXPath: '//b/text()[8]', // "et yada yada"
-        endOffset: 0,
+        endContainerXPath: '//b/text()[6]', // " am"
+        endOffset: 3,
       },
     ]);
   });