You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@annotator.apache.org by ge...@apache.org on 2020/09/16 17:40:03 UTC

[incubator-annotator] 02/04: Support matching across chunks

This is an automated email from the ASF dual-hosted git repository.

gerben pushed a commit to branch chunking
in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git

commit 4e1d317dcef8ba89de53267961ed2f286da1cd6f
Author: Gerben <ge...@treora.com>
AuthorDate: Wed Sep 16 15:03:00 2020 +0200

    Support matching across chunks
---
 packages/dom/src/text-quote/match.ts       | 83 +++++++++++++++++++++++-------
 packages/dom/test/text-quote/match.test.ts |  8 +--
 2 files changed, 69 insertions(+), 22 deletions(-)

diff --git a/packages/dom/src/text-quote/match.ts b/packages/dom/src/text-quote/match.ts
index 6b7fd93..c3fb43d 100644
--- a/packages/dom/src/text-quote/match.ts
+++ b/packages/dom/src/text-quote/match.ts
@@ -54,36 +54,83 @@ interface AbstractRange<TChunk> {
 export function abstractTextQuoteSelectorMatcher(
   selector: TextQuoteSelector,
 ): <TChunk extends Chunk>(textChunks: AsyncIterable<TChunk>) => AsyncGenerator<AbstractRange<TChunk>, void, void> {
-  return async function* matchAll(textChunks) {
+  return async function* matchAll<TChunk extends Chunk>(textChunks: AsyncIterable<TChunk>) {
     const exact = selector.exact;
     const prefix = selector.prefix || '';
     const suffix = selector.suffix || '';
     const searchPattern = prefix + exact + suffix;
 
+    let partialMatches: Array<{
+      startChunk: TChunk;
+      startIndex: number;
+      charactersMatched: number;
+    }> = [];
+
     for await (const chunk of textChunks) {
       const chunkValue = chunk.toString();
 
-      // Find the pattern in the chunk (possibly multiple times)
-      // TODO allow pattern to be spread across chunks
-      let fromIndex = 0;
-      while (fromIndex <= chunkValue.length) {
-        const patternStartIndex = chunkValue.indexOf(searchPattern, fromIndex);
-        if (patternStartIndex === -1) break;
+      // Continue checking any partial matches from the previous chunk(s).
+      const remainingPartialMatches: typeof partialMatches = [];
+      for (const { startChunk, startIndex, charactersMatched } of partialMatches) {
+        if (searchPattern.length - charactersMatched > chunkValue.length) {
+          if (chunkValue === searchPattern.substring(charactersMatched, charactersMatched + chunkValue.length)) {
+            // The chunk is too short to complete the match; comparison has to be completed in subsequent chunks.
+            remainingPartialMatches.push({
+              startChunk,
+              startIndex,
+              charactersMatched: charactersMatched + chunkValue.length,
+            });
+          }
+        }
+        else if (chunkValue.startsWith(searchPattern.substring(charactersMatched))) {
+          yield {
+            startChunk,
+            startIndex,
+            endChunk: chunk,
+            endIndex: searchPattern.length - charactersMatched,
+          };
+        }
+      }
+      partialMatches = remainingPartialMatches;
+
+      // Try find the whole pattern in the chunk (possibly multiple times).
+      if (searchPattern.length <= chunkValue.length) {
+        let fromIndex = 0;
+        while (fromIndex <= chunkValue.length) {
+          const patternStartIndex = chunkValue.indexOf(searchPattern, fromIndex);
+          if (patternStartIndex === -1) break;
 
-        // Correct for the prefix and suffix lengths.
-        const matchStartIndex = patternStartIndex + prefix.length;
-        const matchEndIndex = matchStartIndex + exact.length;
+          // Correct for the prefix and suffix lengths.
+          const matchStartIndex = patternStartIndex + prefix.length;
+          const matchEndIndex = matchStartIndex + exact.length;
 
-        yield {
-          startChunk: chunk,
-          startIndex: matchStartIndex,
-          endChunk: chunk,
-          endIndex: matchEndIndex,
-        };
+          yield {
+            startChunk: chunk,
+            startIndex: matchStartIndex,
+            endChunk: chunk,
+            endIndex: matchEndIndex,
+          };
+
+          // Advance the search forward to detect multiple occurrences within the same chunk.
+          fromIndex = matchStartIndex + 1;
+        }
+      }
 
-        // Advance the search forward to detect multiple occurrences within the same chunk.
-        fromIndex = matchStartIndex + 1;
+      // Check if this chunk ends with a partial match (or even multiple partial matches).
+      let newPartialMatches: number[] = [];
+      const searchStartPoint = Math.max(chunkValue.length - searchPattern.length + 1, 0);
+      for (let i = searchStartPoint; i < chunkValue.length; i++) {
+        const character = chunkValue[i];
+        newPartialMatches = newPartialMatches.filter(
+          partialMatchStartIndex => (character === searchPattern[i - partialMatchStartIndex])
+        );
+        if (character === searchPattern[0]) newPartialMatches.push(i);
       }
+      newPartialMatches.forEach(partialMatchStartIndex => partialMatches.push({
+        startChunk: chunk,
+        startIndex: partialMatchStartIndex,
+        charactersMatched: chunkValue.length - partialMatchStartIndex,
+      }));
     }
   };
 }
diff --git a/packages/dom/test/text-quote/match.test.ts b/packages/dom/test/text-quote/match.test.ts
index 7d52e15..fe8d36b 100644
--- a/packages/dom/test/text-quote/match.test.ts
+++ b/packages/dom/test/text-quote/match.test.ts
@@ -60,8 +60,8 @@ describe('createTextQuoteSelectorMatcher', () => {
       {
         startContainerXPath: '//b/text()[13]',
         startOffset: 0,
-        endContainerXPath: '//b/text()[21]',
-        endOffset: 0,
+        endContainerXPath: '//b/text()[20]',
+        endOffset: 1,
       },
     ]);
   });
@@ -89,8 +89,8 @@ describe('createTextQuoteSelectorMatcher', () => {
       {
         startContainerXPath: '//b/text()[4]', // "dolor"
         startOffset: 0,
-        endContainerXPath: '//b/text()[8]', // "et yada yada"
-        endOffset: 0,
+        endContainerXPath: '//b/text()[6]', // " am"
+        endOffset: 3,
       },
     ]);
   });