You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@annotator.apache.org by ge...@apache.org on 2020/11/12 22:08:11 UTC

[incubator-annotator] 03/04: Make describeTextQuote work, mostly.

This is an automated email from the ASF dual-hosted git repository.

gerben pushed a commit to branch import-dom-seek
in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git

commit 657daab660a91ce7057136e5417326b7ae42f239
Author: Gerben <ge...@treora.com>
AuthorDate: Thu Nov 12 12:14:24 2020 +0100

    Make describeTextQuote work, mostly.
    
    Tests pass, still some issues with empty quotes at chunk edges
---
 packages/dom/src/chunker.ts             |  4 +-
 packages/dom/src/normalize-range.ts     |  4 ++
 packages/dom/src/seek.ts                | 19 ++++---
 packages/dom/src/text-quote/describe.ts | 84 +++++++++++++++++++++++-------
 packages/dom/src/text-quote/match.ts    | 90 ++++++++++++++++++++++-----------
 5 files changed, 145 insertions(+), 56 deletions(-)

diff --git a/packages/dom/src/chunker.ts b/packages/dom/src/chunker.ts
index 7209d7a..2becebf 100644
--- a/packages/dom/src/chunker.ts
+++ b/packages/dom/src/chunker.ts
@@ -104,11 +104,13 @@ export class TextNodeChunker implements Chunker<PartialTextNode> {
 
   rangeToChunkRange(range: Range): ChunkRange<PartialTextNode> {
     const textRange = normalizeRange(range);
+    // FIXME: normalizeRange can mess up: a collapsed range at the very end of
+    // the chunker’s scope might move to the next text node outside the scope.
 
     const startChunk = this.nodeToChunk(textRange.startContainer);
     const startIndex = textRange.startOffset - startChunk.startOffset;
     const endChunk = this.nodeToChunk(textRange.endContainer);
-    const endIndex = textRange.endOffset - endChunk.endOffset;
+    const endIndex = textRange.endOffset - endChunk.startOffset;
 
     return { startChunk, startIndex, endChunk, endIndex };
   }
diff --git a/packages/dom/src/normalize-range.ts b/packages/dom/src/normalize-range.ts
index 8616bce..0fece41 100644
--- a/packages/dom/src/normalize-range.ts
+++ b/packages/dom/src/normalize-range.ts
@@ -48,6 +48,10 @@ export interface TextRange extends Range {
 // if there are equivalent text selections it takes the narrowest option (i.e.
 // it prefers the start not to be at the end of a text node, and vice versa).
 //
+// If there is no text between the start and end, they thus collapse onto one a
+// single position; and if there are multiple equivalent positions, it takes the
+// first one.
+//
 // Note that if the given range does not contain non-empty text nodes, it will
 // end up pointing at a text node outside of it (after it if possible, else
 // before). If the document does not contain any text nodes, an error is thrown.
diff --git a/packages/dom/src/seek.ts b/packages/dom/src/seek.ts
index 1c821a9..36914a7 100644
--- a/packages/dom/src/seek.ts
+++ b/packages/dom/src/seek.ts
@@ -111,15 +111,14 @@ export class TextSeeker<TChunk extends Chunk<string>> implements Seeker<string>
 
     if (this.position <= target) {
       while (this.position <= target) { // could be `while (true)`?
-        if (!roundUp && target < this.currentChunkPosition + this.currentChunk.data.length) {
-          // The target is before the end of the current chunk.
+        if (
+          this.currentChunkPosition + this.currentChunk.data.length <= target
+          || (roundUp && this.offsetInChunk !== 0)
+        ) {
+          // The target is beyond the current chunk.
           // (we use < not ≤: if the target is *at* the end of the chunk, possibly
           // because the current chunk is empty, we prefer to take the next chunk)
-          const newOffset = target - this.currentChunkPosition;
-          if (read) result += this.currentChunk.data.substring(this.offsetInChunk, newOffset);
-          this.offsetInChunk = newOffset;
-          break;
-        } else {
+
           // Move to the start of the next chunk, while counting the characters of the current one.
           if (read) result += this.currentChunk.data.substring(this.offsetInChunk);
           const chunkLength = this.currentChunk.data.length;
@@ -140,6 +139,12 @@ export class TextSeeker<TChunk extends Chunk<string>> implements Seeker<string>
             else
               throw new RangeError(E_END);
           }
+        } else {
+          // The target is within the current chunk.
+          const newOffset = target - this.currentChunkPosition;
+          if (read) result += this.currentChunk.data.substring(this.offsetInChunk, newOffset);
+          this.offsetInChunk = newOffset;
+          break;
         }
       }
     } else { // Similar to the if-block, but moving backward in the text.
diff --git a/packages/dom/src/text-quote/describe.ts b/packages/dom/src/text-quote/describe.ts
index cbad0c3..1a7941d 100644
--- a/packages/dom/src/text-quote/describe.ts
+++ b/packages/dom/src/text-quote/describe.ts
@@ -26,17 +26,18 @@ import { TextSeeker, NonEmptyChunker } from '../seek';
 
 export async function describeTextQuote(
   range: Range,
-  scope?: Range,
+  maybeScope?: Range,
 ): Promise<TextQuoteSelector> {
   // Default to search in the whole document.
-  if (scope === undefined) {
+  let scope: Range;
+  if (maybeScope !== undefined) {
+    scope = maybeScope;
+  } else {
     const document = ownerDocument(range);
     scope = document.createRange();
     scope.selectNodeContents(document);
   }
 
-  const chunker = new TextNodeChunker(scope);
-
   // Take the part of the range that falls within the scope.
   range = range.cloneRange();
   if (range.compareBoundaryPoints(Range.START_TO_START, scope) === -1)
@@ -44,17 +45,21 @@ export async function describeTextQuote(
   if (range.compareBoundaryPoints(Range.END_TO_END, scope) === 1)
     range.setEnd(scope.endContainer, scope.endOffset);
 
+  const chunker = new TextNodeChunker(scope);
+
   return await abstractDescribeTextQuote(
     chunker.rangeToChunkRange(range),
-    chunker,
+    () => new TextNodeChunker(scope),
   );
 }
 
 async function abstractDescribeTextQuote<TChunk extends Chunk<string>>(
   target: ChunkRange<TChunk>,
-  scope: Chunker<TChunk>,
+  scope: () => Chunker<TChunk>,
 ): Promise<TextQuoteSelector> {
-  const seeker = new TextSeeker(scope as NonEmptyChunker<TChunk>);
+  const seeker = new TextSeeker(scope() as NonEmptyChunker<TChunk>);
+
+  // Read the target’s exact text.
   seeker.seekToChunk(target.startChunk, target.startIndex);
   const exact = seeker.readToChunk(target.endChunk, target.endIndex);
 
@@ -70,7 +75,8 @@ async function abstractDescribeTextQuote<TChunk extends Chunk<string>>(
       prefix,
       suffix,
     }
-    const matches = abstractTextQuoteSelectorMatcher(tentativeSelector)(scope);
+
+    const matches = abstractTextQuoteSelectorMatcher(tentativeSelector)(scope());
     let nextMatch = await matches.next();
 
     if (!nextMatch.done && chunkRangeEquals(nextMatch.value, target)) {
@@ -81,17 +87,19 @@ async function abstractDescribeTextQuote<TChunk extends Chunk<string>>(
     // If there are no more unintended matches, our selector is unambiguous!
     if (nextMatch.done) return tentativeSelector;
 
-    // TODO either reset chunker to start from the beginning, or rewind the chunker by previous match’s length.
-    // chunker.seekTo(0)  or chunker.seek(-prefix)
+    // A subsequent search could safely skip the part we already processed,
+    // we’d need the matcher to start at the seeker’s position, instead of
+    // searching in the whole current chunk.
+    // seeker.seekBy(-prefix.length + 1);
 
     // We’ll have to add more prefix/suffix to disqualify this unintended match.
     const unintendedMatch = nextMatch.value;
-    const seeker1 = new TextSeeker(scope as NonEmptyChunker<TChunk>);
-    const seeker2 = new TextSeeker(scope as NonEmptyChunker<TChunk>); // TODO must clone scope.
+    const seeker1 = new TextSeeker(scope() as NonEmptyChunker<TChunk>);
+    const seeker2 = new TextSeeker(scope() as NonEmptyChunker<TChunk>);
 
     // Count how many characters we’d need as a prefix to disqualify this match.
-    seeker1.seekToChunk(target.startChunk, target.startIndex);
-    seeker2.seekToChunk(unintendedMatch.startChunk, unintendedMatch.startIndex);
+    seeker1.seekToChunk(target.startChunk, target.startIndex - prefix.length);
+    seeker2.seekToChunk(unintendedMatch.startChunk, unintendedMatch.startIndex - prefix.length);
     let sufficientPrefix: string | undefined = prefix;
     while (true) {
       let previousCharacter: string;
@@ -102,15 +110,53 @@ async function abstractDescribeTextQuote<TChunk extends Chunk<string>>(
         break;
       }
       sufficientPrefix = previousCharacter + sufficientPrefix;
-      if (previousCharacter !== seeker2.read(-1)) break;
+
+      // Break if the newly added character makes the prefix unambiguous.
+      try {
+        const unintendedMatchPreviousCharacter = seeker2.read(-1);
+        if (previousCharacter !== unintendedMatchPreviousCharacter) break;
+      } catch (err) {
+        if (err instanceof RangeError)
+          break;
+        else
+          throw err;
+      }
+    }
+
+    // Count how many characters we’d need as a suffix to disqualify this match.
+    seeker1.seekToChunk(target.endChunk, target.endIndex + suffix.length);
+    seeker2.seekToChunk(unintendedMatch.endChunk, unintendedMatch.endIndex + suffix.length);
+    let sufficientSuffix: string | undefined = suffix;
+    while (true) {
+      let nextCharacter: string;
+      try {
+        nextCharacter = seeker1.read(1);
+      } catch (err) {
+        sufficientSuffix = undefined; // End of text reached.
+        break;
+      }
+      sufficientSuffix += nextCharacter;
+
+      // Break if the newly added character makes the suffix unambiguous.
+      try {
+        const unintendedMatchNextCharacter = seeker2.read(1);
+        if (nextCharacter !== unintendedMatchNextCharacter) break;
+      } catch (err) {
+        if (err instanceof RangeError)
+          break;
+        else
+          throw err;
+      }
     }
 
     // Use either the prefix or suffix, whichever is shortest.
-    if (sufficientPrefix !== undefined && (sufficientSuffix === undefined || sufficientPrefix.length <= sufficientSuffix.length))
-      prefix = sufficientPrefix; // chunker.seek(prefix.length - sufficientPrefix.length)
-    else if (sufficientSuffix !== undefined)
+    if (sufficientPrefix !== undefined && (sufficientSuffix === undefined || sufficientPrefix.length <= sufficientSuffix.length)) {
+      prefix = sufficientPrefix;
+      // seeker.seekBy(sufficientPrefix.length - prefix.length) // Would be required if we’d skip the processed part.
+    } else if (sufficientSuffix !== undefined) {
       suffix = sufficientSuffix;
-    else
+    } else {
       throw new Error('Target cannot be disambiguated; how could that have happened‽');
+    }
   }
 }
diff --git a/packages/dom/src/text-quote/match.ts b/packages/dom/src/text-quote/match.ts
index 112e054..73cd1d7 100644
--- a/packages/dom/src/text-quote/match.ts
+++ b/packages/dom/src/text-quote/match.ts
@@ -53,41 +53,62 @@ export function abstractTextQuoteSelectorMatcher(
     const suffix = selector.suffix || '';
     const searchPattern = prefix + exact + suffix;
 
-    let partialMatches: Array<{
-      startChunk: TChunk;
-      startIndex: number;
+    // The code below runs a loop with three steps:
+    // 1. Continue checking any partial matches from the previous chunk(s).
+    // 2. Try find the whole pattern in the chunk (possibly multiple times).
+    // 3. Check if this chunk ends with a partial match (or even multiple partial matches).
+
+    interface PartialMatch {
+      startChunk?: TChunk;
+      startIndex?: number;
+      endChunk?: TChunk;
+      endIndex?: number;
       charactersMatched: number;
-    }> = [];
+    }
+    let partialMatches: PartialMatch[] = [];
 
     let chunk: TChunk | null;
     while (chunk = textChunks.currentChunk) {
       const chunkValue = chunk.data;
 
-      // Continue checking any partial matches from the previous chunk(s).
+      // 1. Continue checking any partial matches from the previous chunk(s).
       const remainingPartialMatches: typeof partialMatches = [];
-      for (const { startChunk, startIndex, charactersMatched } of partialMatches) {
-        if (searchPattern.length - charactersMatched > chunkValue.length) {
-          if (chunkValue === searchPattern.substring(charactersMatched, charactersMatched + chunkValue.length)) {
-            // The chunk is too short to complete the match; comparison has to be completed in subsequent chunks.
-            remainingPartialMatches.push({
-              startChunk,
-              startIndex,
-              charactersMatched: charactersMatched + chunkValue.length,
-            });
+      for (const partialMatch of partialMatches) {
+        const charactersMatched = partialMatch.charactersMatched;
+
+        // If the current chunk contains the start and/or end of the match, record these.
+        if (partialMatch.endChunk === undefined) {
+          const charactersUntilMatchEnd = prefix.length + exact.length - charactersMatched;
+          if (charactersUntilMatchEnd <= chunkValue.length) {
+            partialMatch.endChunk = chunk;
+            partialMatch.endIndex = charactersUntilMatchEnd;
           }
         }
-        else if (chunkValue.startsWith(searchPattern.substring(charactersMatched))) {
-          yield {
-            startChunk,
-            startIndex,
-            endChunk: chunk,
-            endIndex: searchPattern.length - charactersMatched,
-          };
+        if (partialMatch.startChunk === undefined) {
+          const charactersUntilMatchStart = prefix.length - charactersMatched;
+          if (
+            charactersUntilMatchStart < chunkValue.length
+            || partialMatch.endChunk !== undefined // handles an edge case: an empty quote at the end of a chunk.
+          ) {
+            partialMatch.startChunk = chunk;
+            partialMatch.startIndex = charactersUntilMatchStart;
+          }
+        }
+
+        const charactersUntilSuffixEnd = searchPattern.length - charactersMatched;
+        if (charactersUntilSuffixEnd <= chunkValue.length) {
+          if (chunkValue.startsWith(searchPattern.substring(charactersMatched))) {
+            yield partialMatch as ChunkRange<TChunk>; // all fields are certainly defined now.
+          }
+        } else if (chunkValue === searchPattern.substring(charactersMatched, charactersMatched + chunkValue.length)) {
+          // The chunk is too short to complete the match; comparison has to be completed in subsequent chunks.
+          partialMatch.charactersMatched += chunkValue.length;
+          remainingPartialMatches.push(partialMatch);
         }
       }
       partialMatches = remainingPartialMatches;
 
-      // Try find the whole pattern in the chunk (possibly multiple times).
+      // 2. Try find the whole pattern in the chunk (possibly multiple times).
       if (searchPattern.length <= chunkValue.length) {
         let fromIndex = 0;
         while (fromIndex <= chunkValue.length) {
@@ -106,11 +127,11 @@ export function abstractTextQuoteSelectorMatcher(
           };
 
           // Advance the search forward to detect multiple occurrences within the same chunk.
-          fromIndex = matchStartIndex + 1;
+          fromIndex = patternStartIndex + 1;
         }
       }
 
-      // Check if this chunk ends with a partial match (or even multiple partial matches).
+      // 3. Check if this chunk ends with a partial match (or even multiple partial matches).
       let newPartialMatches: number[] = [];
       const searchStartPoint = Math.max(chunkValue.length - searchPattern.length + 1, 0);
       for (let i = searchStartPoint; i < chunkValue.length; i++) {
@@ -121,11 +142,22 @@ export function abstractTextQuoteSelectorMatcher(
         if (character === searchPattern[0]) newPartialMatches.push(i);
       }
       for (const partialMatchStartIndex of newPartialMatches) {
-        partialMatches.push({
-          startChunk: chunk,
-          startIndex: partialMatchStartIndex,
-          charactersMatched: chunkValue.length - partialMatchStartIndex,
-        });
+        const charactersMatched = chunkValue.length - partialMatchStartIndex;
+        const partialMatch: PartialMatch = {
+          charactersMatched,
+        };
+        if (charactersMatched >= prefix.length + exact.length) {
+          partialMatch.endChunk = chunk;
+          partialMatch.endIndex = partialMatchStartIndex + prefix.length + exact.length;
+        }
+        if (
+          charactersMatched > prefix.length
+          || partialMatch.endChunk !== undefined // handles an edge case: an empty quote at the end of a chunk.
+        ) {
+          partialMatch.startChunk = chunk;
+          partialMatch.startIndex = partialMatchStartIndex + prefix.length;
+        }
+        partialMatches.push(partialMatch);
       }
 
       if (textChunks.nextChunk() === null)