You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@annotator.apache.org by ge...@apache.org on 2020/10/15 15:02:39 UTC
[incubator-annotator] branch import-dom-seek updated: WIP make Seeker count character offset.

This is an automated email from the ASF dual-hosted git repository.

gerben pushed a commit to branch import-dom-seek
in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git


The following commit(s) were added to refs/heads/import-dom-seek by this push:
     new c8f7365  WIP make Seeker count character offset.
c8f7365 is described below

commit c8f7365b9037df7da0608fb79883f03bb0b04e48
Author: Gerben <ge...@treora.com>
AuthorDate: Wed Oct 14 15:23:31 2020 +0200

    WIP make Seeker count character offset.
---
 packages/dom/src/seek.ts                | 164 +++++++++++++++++++-------------
 packages/dom/src/text-position/match.ts |  36 +------
 packages/dom/src/text-quote/match.ts    |  27 +++---
 3 files changed, 117 insertions(+), 110 deletions(-)

diff --git a/packages/dom/src/seek.ts b/packages/dom/src/seek.ts
index 1344dc8..2d0efdc 100644
--- a/packages/dom/src/seek.ts
+++ b/packages/dom/src/seek.ts
@@ -1,14 +1,40 @@
 import { ownerDocument } from "./owner-document";
 
 const E_END = 'Iterator exhausted before seek ended.';
-const E_WHERE = 'Argument of seek must be an integer or a Text Node.';
 
 export class Seeker {
-  iter: NodeIterator;
+  // The node containing our current text position.
+  get referenceNode(): Text {
+    // The NodeFilter will guarantee this is a Text node (except before the
+    // first iteration step, but we do such a step in the constructor).
+    return this.iter.referenceNode as Text;
+  }
+
+  // The position inside iter.referenceNode where the last seek ended up.
+  offsetInReferenceNode = 0;
+
+  // The index of the first character of iter.referenceNode inside the text.
+  // get referenceNodeIndex() { return this.position - this.offsetInReferenceNode; }
+  referenceNodeIndex = 0;
+
+  // The current text position, i.e. the number of code units passed so far.
+  // position = 0;
+  get position() { return this.referenceNodeIndex + this.offsetInReferenceNode; }
+
+  // // The number of code points passed so far.
+  // codePointCount = 0;
+
+  private iter: NodeIterator;
+
+  // // Counting code points is optional, to save the effort when it is not required.
+  // private countCodePoints: boolean;
 
-  constructor(scope: Range) {
-    const document = ownerDocument(scope);
-    this.iter = document.createNodeIterator(
+  constructor(scope: Range, options: {
+    // countCodePoints?: boolean
+  } = {}) {
+    // this.countCodePoints = options.countCodePoints ?? false;
+
+    this.iter = ownerDocument(scope).createNodeIterator(
       scope.commonAncestorContainer,
       NodeFilter.SHOW_TEXT,
       {
@@ -19,82 +45,92 @@ export class Seeker {
         },
       },
     );
-  }
 
-  getCurrentNode() {
-    return this.iter.referenceNode;
+    if (isText(scope.startContainer)) {
+      // The scope starts inside the text node. Adjust our index accordingly.
+      this.referenceNodeIndex = -scope.startOffset;
+      this.offsetInReferenceNode = scope.startOffset;
+    }
+    // TODO Handle the scope.endOffset as well, and fix behaviour in edge cases
+    // (e.g. any use of referenceNode.length is incorrect at the edges).
+
+    // Walk to the start of the first non-empty text node inside the scope.
+    this.seekTo(0);
   }
 
-  seek(where: number | Text): number {
-    const iter = this.iter;
-
-    let count = 0;
-    let node: Node | null = iter.referenceNode;
-    let predicates = null;
-
-    if (isInteger(where)) {
-      predicates = {
-        forward: () => count < where,
-        backward: () => count > where || !iter.pointerBeforeReferenceNode,
-      };
-    } else if (isText(where)) {
-      predicates = {
-        forward: before(node, where) ? () => false : () => node !== where,
-        backward: () => node !== where || !iter.pointerBeforeReferenceNode,
-      };
-    } else {
-      throw new TypeError(E_WHERE);
-    }
+  // seekCodePoints(count: number) {
+  // }
 
-    while (predicates.forward()) {
-      node = iter.nextNode();
+  seekBy(count: number) {
+    return this.seekTo(this.position + count);
+  }
 
-      if (node === null) {
-        throw new RangeError(E_END);
+  seekTo(target: number) {
+    // Move the iterator to after the current node, so nextNode() would cause a jump.
+    if (this.iter.pointerBeforeReferenceNode)
+      this.iter.nextNode();
+
+    while (this.position <= target) {
+      if (target < this.referenceNodeIndex + this.referenceNode.length) {
+        // The target is before the end of the current node.
+        // (we use < not ≤: if the target is *at* the end of the node, possibly
+        // because the current node is empty, we prefer to take the next node)
+        this.offsetInReferenceNode = target - this.referenceNodeIndex;
+        // if (this.countCodePoints)
+        //   this.codePointCount += [...this.referenceNode.data.substring(oldOffset, this.offsetInReferenceNode)].length;
+        break;
       }
 
-      count += (node as Text).data.length;
-    }
-
-    // If there are subsequent nodes, move to ‘before’ the next non-empty
-    // node (or the last node, in case all subsequent nodes are empty).
-    // As this moves from ‘after’ the current node, count is not changed.
-    if (iter.nextNode()) {
-      node = iter.referenceNode;
-      while (node !== null && (node as Text).data.length === 0) { // node should always be Text now due to the NodeFilter.
-        node = iter.nextNode();
+      // Move to the start of the next node, while counting the characters of the current one.
+      const curNode = this.referenceNode;
+      const nextNode = this.iter.nextNode();
+      if (nextNode !== null) {
+        this.referenceNodeIndex += curNode.length;
+        this.offsetInReferenceNode = 0;
+        // if (this.countCodePoints)
+        //   this.codePointCount += [...curNode.data].length;
+      } else {
+        // There is no next node. Finish at the end of the last node.
+        this.offsetInReferenceNode = this.referenceNode.length;
+        // if (this.countCodePoints)
+        //   this.codePointCount += [...this.referenceNode.data.substring(this.offsetInReferenceNode)].length;
+        // Either the end of this node is our target, or the seek failed.
+        if (this.position === target)
+          break;
+        else
+          throw new RangeError(E_END);
       }
-      // Note this direction switch stays within the same node.
-      node = iter.previousNode();
     }
 
-    while (predicates.backward()) {
-      node = iter.previousNode();
+    // Move to the start of the current node.
+    if (!this.iter.pointerBeforeReferenceNode)
+      this.iter.previousNode();
 
-      if (node === null) {
-        throw new RangeError(E_END);
+    while (this.position > target) {
+      if (this.referenceNodeIndex <= target) {
+        this.offsetInReferenceNode = target - this.referenceNodeIndex;
+        // if (this.countCodePoints)
+        //   this.codePointCount -= [...this.referenceNode.data.substring(this.offsetInReferenceNode, oldOffset)].length;
+        break;
       }
 
-      count -= (node as Text).data.length;
-    }
-
-    if (!isText(iter.referenceNode)) {
-      throw new RangeError(E_END);
+      // Move to the end of the previous node.
+      // const curNode = this.referenceNode;
+      const prevNode = this.iter.previousNode();
+      if (prevNode !== null) {
+        this.referenceNodeIndex -= this.referenceNode.length;
+        this.offsetInReferenceNode = this.referenceNode.length;
+        // if (this.countCodePoints)
+        //   this.codePointCount -= [...curNode.data].length;
+      } else {
+        this.offsetInReferenceNode = 0;
+        // this.codePointCount -= [...this.referenceNode.data.substring(0, oldOffset)].length;
+        throw new RangeError(E_END);
+      }
     }
-
-    return count;
   }
 }
 
-function isInteger(n: any): n is number {
-  if (typeof n !== 'number') return false;
-  return isFinite(n) && Math.floor(n) === n;
-}
-
 function isText(node: Node): node is Text {
   return node.nodeType === Node.TEXT_NODE;
 }
-
-function before(ref: Node, node: Node): boolean {
-  return !!(ref.compareDocumentPosition(node) & Node.DOCUMENT_POSITION_PRECEDING);
-}
diff --git a/packages/dom/src/text-position/match.ts b/packages/dom/src/text-position/match.ts
index 7a57226..b14a3b2 100644
--- a/packages/dom/src/text-position/match.ts
+++ b/packages/dom/src/text-position/match.ts
@@ -27,51 +27,23 @@ export function createTextPositionSelectorMatcher(
 ): Matcher<Range, Range> {
   return async function* matchAll(scope) {
     const document = ownerDocument(scope);
-    const scopeText = scope.toString();
 
     const { start, end } = selector;
 
     const seeker = new Seeker(scope);
 
-    // The index of the first character of iter.referenceNode inside the text.
-    let referenceNodeIndex = isTextNode(scope.startContainer)
-      ? -scope.startOffset
-      : 0;
-
-    // String indices are based on code points, not code units, so we actually have to count.
-    const matchStartIndex = getIndexOfCharacterNumber(scopeText, start);
-    const matchEndIndex = getIndexOfCharacterNumber(scopeText, end);
-
     // Create a range to represent the described text in the dom.
     const match = document.createRange();
 
     // Seek to the start of the match, make the range start there.
-    referenceNodeIndex += seeker.seek(matchStartIndex - referenceNodeIndex);
-    match.setStart(seeker.getCurrentNode(), matchStartIndex - referenceNodeIndex);
+    seeker.seekTo(start);
+    match.setStart(seeker.referenceNode, seeker.referenceNodeIndex);
 
     // Seek to the end of the match, make the range end there.
-    referenceNodeIndex += seeker.seek(matchEndIndex - referenceNodeIndex);
-    match.setEnd(seeker.getCurrentNode(), matchEndIndex - referenceNodeIndex);
+    seeker.seekTo(end);
+    match.setEnd(seeker.referenceNode, seeker.referenceNodeIndex);
 
     // Yield the match.
     yield match;
   };
 }
-
-function isTextNode(node: Node): node is Text {
-  return node.nodeType === Node.TEXT_NODE;
-}
-
-function getIndexOfCharacterNumber(text: string, characterNumber: number): number {
-  let index = 0;
-  let characterCount = 0;
-  for (let character of text) {
-    if (characterCount >= characterNumber) // using >= to avoid infinite loop on invalid input.
-      break;
-    index += character.length; // note the length is either 1 or 2
-    characterCount++;
-  }
-  if (characterCount === characterNumber)
-    return index;
-  throw new RangeError;
-}
diff --git a/packages/dom/src/text-quote/match.ts b/packages/dom/src/text-quote/match.ts
index 6282769..d9b7da9 100644
--- a/packages/dom/src/text-quote/match.ts
+++ b/packages/dom/src/text-quote/match.ts
@@ -34,12 +34,15 @@ export function createTextQuoteSelectorMatcher(
     const suffix = selector.suffix || '';
     const searchPattern = prefix + exact + suffix;
 
-    const seeker = new Seeker(scope);
-
-    // The index of the first character of iter.referenceNode inside the text.
-    let referenceNodeIndex = isTextNode(scope.startContainer)
-      ? -scope.startOffset
-      : 0;
+    let seeker: Seeker;
+    try {
+      seeker = new Seeker(scope);
+    } catch (error) {
+      // If the scope does not contain text nodes, we can stop. (if it contains
+      // only empty text nodes we continue: it would still match an empty quote)
+      if (error instanceof RangeError) return;
+      else throw error;
+    }
 
     let fromIndex = 0;
     while (fromIndex <= scopeText.length) {
@@ -55,12 +58,12 @@ export function createTextQuoteSelectorMatcher(
       const match = document.createRange();
 
       // Seek to the start of the match, make the range start there.
-      referenceNodeIndex += seeker.seek(matchStartIndex - referenceNodeIndex);
-      match.setStart(seeker.getCurrentNode(), matchStartIndex - referenceNodeIndex);
+      seeker.seekTo(matchStartIndex);
+      match.setStart(seeker.referenceNode, seeker.offsetInReferenceNode);
 
       // Seek to the end of the match, make the range end there.
-      referenceNodeIndex += seeker.seek(matchEndIndex - referenceNodeIndex);
-      match.setEnd(seeker.getCurrentNode(), matchEndIndex - referenceNodeIndex);
+      seeker.seekTo(matchEndIndex);
+      match.setEnd(seeker.referenceNode, seeker.offsetInReferenceNode);
 
       // Yield the match.
       yield match;
@@ -70,7 +73,3 @@ export function createTextQuoteSelectorMatcher(
     }
   };
 }
-
-function isTextNode(node: Node): node is Text {
-  return node.nodeType === Node.TEXT_NODE;
-}