You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@annotator.apache.org by ge...@apache.org on 2020/11/06 13:19:30 UTC

[incubator-annotator] branch import-dom-seek updated (0b3a9f6 -> 15e8ffd)

This is an automated email from the ASF dual-hosted git repository.

gerben pushed a change to branch import-dom-seek
in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git.


    from 0b3a9f6  Make text quote search chunk by chunk
     new c025372  Simplify TextSeeker chunk access
     new 15e8ffd  WIP Make describing a text quote work too

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 packages/dom/src/chunker.ts             |   1 +
 packages/dom/src/seek.ts                |  67 ++++++----
 packages/dom/src/text-quote/describe.ts | 214 ++++++++++++++++++--------------
 3 files changed, 163 insertions(+), 119 deletions(-)

[incubator-annotator] 02/02: WIP Make describing a text quote work too

Posted by ge...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

gerben pushed a commit to branch import-dom-seek
in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git

commit 15e8ffd485cf0c15c864c2f63540f9963c8e48e4
Author: Gerben <ge...@treora.com>
AuthorDate: Fri Nov 6 14:18:45 2020 +0100

    WIP Make describing a text quote work too
---
 packages/dom/src/chunker.ts             |   1 +
 packages/dom/src/seek.ts                |  26 +++-
 packages/dom/src/text-quote/describe.ts | 214 ++++++++++++++++++--------------
 3 files changed, 145 insertions(+), 96 deletions(-)

diff --git a/packages/dom/src/chunker.ts b/packages/dom/src/chunker.ts
index 82a72e3..0f3b746 100644
--- a/packages/dom/src/chunker.ts
+++ b/packages/dom/src/chunker.ts
@@ -25,6 +25,7 @@ import { ownerDocument } from "./owner-document";
 // data structure it came from (e.g. a DOM node).
 export interface Chunk<TData extends any> {
   readonly data: TData;
+  equals?(otherChunk: this): boolean;
 }
 
 // A Chunker lets one walk through the chunks of a document.
diff --git a/packages/dom/src/seek.ts b/packages/dom/src/seek.ts
index f09e583..9ceade3 100644
--- a/packages/dom/src/seek.ts
+++ b/packages/dom/src/seek.ts
@@ -18,7 +18,7 @@
  * under the License.
  */
 
-import { Chunk, TextNodeChunker, PartialTextNode } from "./chunker";
+import { Chunk, Chunker, TextNodeChunker, PartialTextNode } from "./chunker";
 
 const E_END = 'Iterator exhausted before seek ended.';
 
@@ -142,10 +142,11 @@ class TextSeeker<TChunk extends Chunk<string>> implements Seeker<string> {
   }
 }
 
-
 export class DomSeeker extends TextSeeker<PartialTextNode> implements BoundaryPointer<Text> {
-  constructor(scope: Range) {
-    const chunker = new TextNodeChunker(scope);
+  constructor(chunkerOrScope: Chunker<PartialTextNode> | Range) {
+    const chunker = isTextNodeChunker(chunkerOrScope)
+      ? chunkerOrScope
+      : new TextNodeChunker(chunkerOrScope);
     if (chunker.currentChunk === null)
       throw new RangeError('Range does not contain any Text nodes.');
     super(chunker as NonEmptyChunker<PartialTextNode>);
@@ -158,4 +159,21 @@ export class DomSeeker extends TextSeeker<PartialTextNode> implements BoundaryPo
   get offsetInReferenceNode() {
     return this.offsetInChunk + this.currentChunk.startOffset;
   }
+
+  seekToBoundaryPoint(node: Node, offset: number) {
+    const document = (node.ownerDocument ?? node as Document);
+    const target = document.createRange();
+    target.setStart(node, offset);
+    // target.setEnd(node, offset); // (implied by setting the start)
+
+    // Seek step by step until we are at, or have crossed, the target point.
+    const reverse = !!(node.compareDocumentPosition(this.referenceNode) & Node.DOCUMENT_POSITION_PRECEDING);
+    while (target.comparePoint(this.referenceNode, this.offsetInReferenceNode) === (reverse ? 1 : -1)) {
+      this.seekBy(reverse ? -1 : 1);
+    }
+  }
+}
+
+function isTextNodeChunker(obj: any): obj is Chunker<PartialTextNode> {
+  return ('currentChunk' in obj && 'nextChunk' in obj && 'previousChunk' in obj);
 }
diff --git a/packages/dom/src/text-quote/describe.ts b/packages/dom/src/text-quote/describe.ts
index e5846bc..787d60d 100644
--- a/packages/dom/src/text-quote/describe.ts
+++ b/packages/dom/src/text-quote/describe.ts
@@ -20,7 +20,9 @@
 
 import type { TextQuoteSelector } from '@annotator/selector';
 import { ownerDocument } from '../owner-document';
-import { Seeker } from '../seek';
+import { Chunk, Chunker, PartialTextNode, TextNodeChunker } from '../chunker';
+import { ChunkRange, abstractTextQuoteSelectorMatcher } from '.';
+import { DomSeeker } from '../seek';
 
 export async function describeTextQuote(
   range: Range,
@@ -32,120 +34,148 @@ export async function describeTextQuote(
     scope = document.createRange();
     scope.selectNodeContents(document);
   }
-  range = range.cloneRange();
+
+  const chunker = new TextNodeChunker(scope);
 
   // Take the part of the range that falls within the scope.
+  range = range.cloneRange();
   if (range.compareBoundaryPoints(Range.START_TO_START, scope) === -1)
     range.setStart(scope.startContainer, scope.startOffset);
   if (range.compareBoundaryPoints(Range.END_TO_END, scope) === 1)
     range.setEnd(scope.endContainer, scope.endOffset);
 
-  return {
-    type: 'TextQuoteSelector',
-    exact: range.toString(),
-    ...calculateContextForDisambiguation(range, scope),
-  };
+  return await abstractDescribeTextQuote(
+    convertRangeToChunkRange(chunker, range),
+    chunker,
+  );
 }
 
-function calculateContextForDisambiguation(
-  range: Range,
-  scope: Range,
-): { prefix: string; suffix: string } {
-  const exactText = range.toString();
-  const scopeText = scope.toString();
-  const targetStartIndex = getRangeTextPosition(range, scope);
-  const targetEndIndex = targetStartIndex + exactText.length;
+async function abstractDescribeTextQuote(
+  target: ChunkRange<Chunk<string>>,
+  scope: Chunker<Chunk<string>>,
+): Promise<TextQuoteSelector> {
+  const exact = readChunkRange(scope, target);
 
   // Starting with an empty prefix and suffix, we search for matches. At each unintended match
   // we encounter, we extend the prefix or suffix just enough to ensure it will no longer match.
   let prefix = '';
   let suffix = '';
-  let fromIndex = 0;
-  while (fromIndex <= scopeText.length) {
-    const searchPattern = prefix + exactText + suffix;
-    const patternMatchIndex = scopeText.indexOf(searchPattern, fromIndex);
-    if (patternMatchIndex === -1) break;
-    fromIndex = patternMatchIndex + 1;
-
-    const matchStartIndex = patternMatchIndex + prefix.length;
-    const matchEndIndex = matchStartIndex + exactText.length;
-
-    // Skip the found match if it is the actual target.
-    if (matchStartIndex === targetStartIndex) continue;
-
-    // Count how many characters we’d need as a prefix to disqualify this match.
-    let sufficientPrefixLength = prefix.length + 1;
-    const firstChar = (offset: number) =>
-      scopeText[offset - sufficientPrefixLength];
-    while (
-      firstChar(targetStartIndex) &&
-      firstChar(targetStartIndex) === firstChar(matchStartIndex)
-    )
-      sufficientPrefixLength++;
-    if (!firstChar(targetStartIndex))
-      // We reached the start of scopeText; prefix won’t work.
-      sufficientPrefixLength = Infinity;
-
-    // Count how many characters we’d need as a suffix to disqualify this match.
-    let sufficientSuffixLength = suffix.length + 1;
-    const lastChar = (offset: number) =>
-      scopeText[offset + sufficientSuffixLength - 1];
-    while (
-      lastChar(targetEndIndex) &&
-      lastChar(targetEndIndex) === lastChar(matchEndIndex)
-    )
-      sufficientSuffixLength++;
-    if (!lastChar(targetEndIndex))
-      // We reached the end of scopeText; suffix won’t work.
-      sufficientSuffixLength = Infinity;
 
-    // Use either the prefix or suffix, whichever is shortest.
-    if (sufficientPrefixLength <= sufficientSuffixLength) {
-      // Compensate our search position for the increase in prefix length.
-      fromIndex -= sufficientPrefixLength - prefix.length;
-      prefix = scopeText.substring(
-        targetStartIndex - sufficientPrefixLength,
-        targetStartIndex,
-      );
-    } else {
-      suffix = scopeText.substring(
-        targetEndIndex,
-        targetEndIndex + sufficientSuffixLength,
-      );
+  while (true) {
+    const tentativeSelector: TextQuoteSelector = {
+      type: 'TextQuoteSelector',
+      exact,
+      prefix,
+      suffix,
+    }
+    const matches = abstractTextQuoteSelectorMatcher(tentativeSelector)(scope);
+    let nextMatch = await matches.next();
+
+    if (!nextMatch.done && chunkRangeEquals(nextMatch.value, target)) {
+      // This match is the intended one, ignore it.
+      nextMatch = await matches.next();
     }
+
+    // If there are no more unintended matches, our selector is unambiguous!
+    if (nextMatch.done) return tentativeSelector;
+
+    // We’ll have to add more prefix/suffix to disqualify this unintended match.
+    const match = nextMatch.value;
+    const sufficientPrefix = charactersNeededToBeUnique(scope, match, target, true);
+    const sufficientSuffix = charactersNeededToBeUnique(scope, match, target, false);
+
+    // Use either the prefix or suffix, whichever is shortest.
+    if (sufficientPrefix !== undefined && (sufficientSuffix === undefined || sufficientPrefix.length <= sufficientSuffix.length))
+      prefix = sufficientPrefix;
+    else if (sufficientSuffix !== undefined)
+      suffix = sufficientSuffix;
+    else
+      throw new Error('Target cannot be disambiguated; how could that have happened‽');
   }
+}
+
+function charactersNeededToBeUnique(
+  chunker: Chunker<Chunk<string>>,
+  match: ChunkRange<any>,
+  target: ChunkRange<any>,
+  reverse: boolean,
+): string | undefined {
+  // TODO. How?
+
+  // // Count how many characters we’d need as a prefix to disqualify this match.
+  // let sufficientPrefixLength = prefix.length + 1;
+  // const firstChar = (offset: number) =>
+  //   scopeText[offset - sufficientPrefixLength];
+  // while (
+  //   firstChar(targetStartIndex) &&
+  //   firstChar(targetStartIndex) === firstChar(matchStartIndex)
+  // )
+  //   sufficientPrefixLength++;
+  // if (!firstChar(targetStartIndex))
+  //   // We reached the start of scopeText; prefix won’t work.
+  //   sufficientPrefixLength = Infinity;
+
+  // // Count how many characters we’d need as a suffix to disqualify this match.
+  // let sufficientSuffixLength = suffix.length + 1;
+  // const lastChar = (offset: number) =>
+  //   scopeText[offset + sufficientSuffixLength - 1];
+  // while (
+  //   lastChar(targetEndIndex) &&
+  //   lastChar(targetEndIndex) === lastChar(matchEndIndex)
+  // )
+  //   sufficientSuffixLength++;
+  // if (!lastChar(targetEndIndex))
+  //   // We reached the end of scopeText; suffix won’t work.
+  //   sufficientSuffixLength = Infinity;
 
-  return { prefix, suffix };
 }
 
-// Get the index of the first character of range within the text of scope.
-function getRangeTextPosition(range: Range, scope: Range): number {
-  const seeker = new Seeker(scope);
-  const scopeOffset = isTextNode(scope.startContainer) ? scope.startOffset : 0;
-  if (isTextNode(range.startContainer))
-    return seeker.seek(range.startContainer) + range.startOffset - scopeOffset;
-  else return seeker.seek(firstTextNodeInRange(range)) - scopeOffset;
+function readChunkRange<TChunk extends Chunk<string>>(
+  chunker: Chunker<TChunk>,
+  {
+    startChunk,
+    startIndex,
+    endChunk,
+    endIndex,
+  }: ChunkRange<TChunk>
+): string {
+  if (startChunk === endChunk)
+    return startChunk.data.substring(startIndex, endIndex);
+
+  let text = startChunk.data.substring(startIndex);
+  // TODO use chunker; or implement Seeker.readToChunk or similar?
+  let curChunk = startChunk;
+  while (curChunk && curChunk !== endChunk) {
+    curChunk = chunker.nextChunk();
+    text += curChunk.data;
+  }
+  text += endChunk.data.substring(0, endIndex);
+  return text;
 }
 
-function firstTextNodeInRange(range: Range): Text {
-  // Find the first text node inside the range.
-  const iter = ownerDocument(range).createNodeIterator(
-    range.commonAncestorContainer,
-    NodeFilter.SHOW_TEXT,
-    {
-      acceptNode(node: Text) {
-        // Only reveal nodes within the range; and skip any empty text nodes.
-        return range.intersectsNode(node) && node.length > 0
-          ? NodeFilter.FILTER_ACCEPT
-          : NodeFilter.FILTER_REJECT;
-      },
-    },
+function chunkRangeEquals(range1: ChunkRange<any>, range2: ChunkRange<any>) {
+  return (
+    chunkEquals(range1.startChunk, range2.startChunk)
+    && chunkEquals(range1.endChunk, range2.endChunk)
+    && range1.startIndex === range2.startIndex
+    && range1.endIndex === range2.endIndex
   );
-  const node = iter.nextNode() as Text | null;
-  if (node === null) throw new Error('Range contains no text nodes');
-  return node;
 }
 
-function isTextNode(node: Node): node is Text {
-  return node.nodeType === Node.TEXT_NODE;
+function chunkEquals(chunk1: Chunk<any>, chunk2: Chunk<any>): boolean {
+  return chunk1.equals ? chunk1.equals(chunk2) : chunk1 === chunk2;
+}
+
+function convertRangeToChunkRange(chunker: Chunker<PartialTextNode>, range: Range): ChunkRange<PartialTextNode> {
+  const domSeeker = new DomSeeker(chunker);
+
+  domSeeker.seekToBoundaryPoint(range.startContainer, range.startOffset);
+  const startChunk = domSeeker.currentChunk;
+  const startIndex = domSeeker.offsetInChunk;
+
+  domSeeker.seekToBoundaryPoint(range.endContainer, range.endOffset);
+  const endChunk = domSeeker.currentChunk;
+  const endIndex = domSeeker.offsetInChunk;
+
+  return { startChunk, startIndex, endChunk, endIndex };
 }

[incubator-annotator] 01/02: Simplify TextSeeker chunk access

Posted by ge...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

gerben pushed a commit to branch import-dom-seek
in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git

commit c025372b08197d47061c2e2c7933e0040bddb36d
Author: Gerben <ge...@treora.com>
AuthorDate: Fri Nov 6 14:15:59 2020 +0100

    Simplify TextSeeker chunk access
    
    The symmetry it had with a BoundaryPoint might just be confusing.
---
 packages/dom/src/seek.ts | 41 ++++++++++++++++++-----------------------
 1 file changed, 18 insertions(+), 23 deletions(-)

diff --git a/packages/dom/src/seek.ts b/packages/dom/src/seek.ts
index 94bb5b1..f09e583 100644
--- a/packages/dom/src/seek.ts
+++ b/packages/dom/src/seek.ts
@@ -41,12 +41,17 @@ export interface Seeker<T extends Iterable<any> = string> {
   seekTo(target: number): void;
 }
 
-class _TextSeeker<TChunk extends Chunk<string>> implements Seeker<string> {
+class TextSeeker<TChunk extends Chunk<string>> implements Seeker<string> {
+  // The chunk containing our current text position.
+  get currentChunk() {
+    return this.chunker.currentChunk;
+  }
+
   // The index of the first character of the current chunk inside the text.
   private currentChunkPosition = 0;
 
   // The position inside the chunk where the last seek ended up.
-  protected offsetInChunk = 0;
+  offsetInChunk = 0;
 
   // The current text position (measured in code units)
   get position() { return this.currentChunkPosition + this.offsetInChunk; }
@@ -79,18 +84,18 @@ class _TextSeeker<TChunk extends Chunk<string>> implements Seeker<string> {
 
     if (this.position <= target) {
       while (this.position <= target) { // could be `while (true)`?
-        if (!roundUp && target < this.currentChunkPosition + this.chunker.currentChunk.data.length) {
+        if (!roundUp && target < this.currentChunkPosition + this.currentChunk.data.length) {
           // The target is before the end of the current chunk.
           // (we use < not ≤: if the target is *at* the end of the chunk, possibly
           // because the current chunk is empty, we prefer to take the next chunk)
           const newOffset = target - this.currentChunkPosition;
-          if (read) result += this.chunker.currentChunk.data.substring(this.offsetInChunk, newOffset);
+          if (read) result += this.currentChunk.data.substring(this.offsetInChunk, newOffset);
           this.offsetInChunk = newOffset;
           break;
         } else {
           // Move to the start of the next chunk, while counting the characters of the current one.
-          if (read) result += this.chunker.currentChunk.data.substring(this.offsetInChunk);
-          const chunkLength = this.chunker.currentChunk.data.length;
+          if (read) result += this.currentChunk.data.substring(this.offsetInChunk);
+          const chunkLength = this.currentChunk.data.length;
           let nextChunk = this.chunker.nextChunk();
           if (nextChunk !== null) {
             // Skip empty chunks.
@@ -115,16 +120,16 @@ class _TextSeeker<TChunk extends Chunk<string>> implements Seeker<string> {
         if (this.currentChunkPosition <= target) {
           // The target is within the current chunk.
           const newOffset = roundUp ? 0 : target - this.currentChunkPosition;
-          if (read) result = this.chunker.currentChunk.data.substring(newOffset, this.offsetInChunk) + result;
+          if (read) result = this.currentChunk.data.substring(newOffset, this.offsetInChunk) + result;
           this.offsetInChunk = newOffset;
           break;
         } else {
           // Move to the end of the previous chunk.
-          if (read) result = this.chunker.currentChunk.data.substring(0, this.offsetInChunk) + result;
+          if (read) result = this.currentChunk.data.substring(0, this.offsetInChunk) + result;
           const previousChunk = this.chunker.previousChunk();
           if (previousChunk !== null) {
-            this.currentChunkPosition -= this.chunker.currentChunk.data.length;
-            this.offsetInChunk = this.chunker.currentChunk.data.length;
+            this.currentChunkPosition -= this.currentChunk.data.length;
+            this.offsetInChunk = this.currentChunk.data.length;
           } else {
             this.offsetInChunk = 0;
             throw new RangeError(E_END);
@@ -137,18 +142,8 @@ class _TextSeeker<TChunk extends Chunk<string>> implements Seeker<string> {
   }
 }
 
-export class TextSeeker<TChunk extends Chunk<string>> extends _TextSeeker<TChunk> implements BoundaryPointer<TChunk> {
-  // The chunk containing our current text position.
-  get referenceNode() {
-    return this.chunker.currentChunk;
-  }
-
-  get offsetInReferenceNode() {
-    return this.offsetInChunk;
-  }
-}
 
-export class DomSeeker extends _TextSeeker<PartialTextNode> implements BoundaryPointer<Text> {
+export class DomSeeker extends TextSeeker<PartialTextNode> implements BoundaryPointer<Text> {
   constructor(scope: Range) {
     const chunker = new TextNodeChunker(scope);
     if (chunker.currentChunk === null)
@@ -157,10 +152,10 @@ export class DomSeeker extends _TextSeeker<PartialTextNode> implements BoundaryP
   }
 
   get referenceNode() {
-    return this.chunker.currentChunk.node;
+    return this.currentChunk.node;
   }
 
   get offsetInReferenceNode() {
-    return this.offsetInChunk + this.chunker.currentChunk.startOffset;
+    return this.offsetInChunk + this.currentChunk.startOffset;
   }
 }