You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@annotator.apache.org by ge...@apache.org on 2020/10/15 15:02:39 UTC
[incubator-annotator] branch import-dom-seek updated: WIP make Seeker count character offset.
This is an automated email from the ASF dual-hosted git repository.
gerben pushed a commit to branch import-dom-seek
in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git
The following commit(s) were added to refs/heads/import-dom-seek by this push:
new c8f7365 WIP make Seeker count character offset.
c8f7365 is described below
commit c8f7365b9037df7da0608fb79883f03bb0b04e48
Author: Gerben <ge...@treora.com>
AuthorDate: Wed Oct 14 15:23:31 2020 +0200
WIP make Seeker count character offset.
---
packages/dom/src/seek.ts | 164 +++++++++++++++++++-------------
packages/dom/src/text-position/match.ts | 36 +------
packages/dom/src/text-quote/match.ts | 27 +++---
3 files changed, 117 insertions(+), 110 deletions(-)
diff --git a/packages/dom/src/seek.ts b/packages/dom/src/seek.ts
index 1344dc8..2d0efdc 100644
--- a/packages/dom/src/seek.ts
+++ b/packages/dom/src/seek.ts
@@ -1,14 +1,40 @@
import { ownerDocument } from "./owner-document";
const E_END = 'Iterator exhausted before seek ended.';
-const E_WHERE = 'Argument of seek must be an integer or a Text Node.';
export class Seeker {
- iter: NodeIterator;
+ // The node containing our current text position.
+ get referenceNode(): Text {
+ // The NodeFilter will guarantee this is a Text node (except before the
+ // first iteration step, but we do such a step in the constructor).
+ return this.iter.referenceNode as Text;
+ }
+
+ // The position inside iter.referenceNode where the last seek ended up.
+ offsetInReferenceNode = 0;
+
+ // The index of the first character of iter.referenceNode inside the text.
+ // get referenceNodeIndex() { return this.position - this.offsetInReferenceNode; }
+ referenceNodeIndex = 0;
+
+ // The current text position, i.e. the number of code units passed so far.
+ // position = 0;
+ get position() { return this.referenceNodeIndex + this.offsetInReferenceNode; }
+
+ // // The number of code points passed so far.
+ // codePointCount = 0;
+
+ private iter: NodeIterator;
+
+ // // Counting code points is optional, to save the effort when it is not required.
+ // private countCodePoints: boolean;
- constructor(scope: Range) {
- const document = ownerDocument(scope);
- this.iter = document.createNodeIterator(
+ constructor(scope: Range, options: {
+ // countCodePoints?: boolean
+ } = {}) {
+ // this.countCodePoints = options.countCodePoints ?? false;
+
+ this.iter = ownerDocument(scope).createNodeIterator(
scope.commonAncestorContainer,
NodeFilter.SHOW_TEXT,
{
@@ -19,82 +45,92 @@ export class Seeker {
},
},
);
- }
- getCurrentNode() {
- return this.iter.referenceNode;
+ if (isText(scope.startContainer)) {
+ // The scope starts inside the text node. Adjust our index accordingly.
+ this.referenceNodeIndex = -scope.startOffset;
+ this.offsetInReferenceNode = scope.startOffset;
+ }
+ // TODO Handle the scope.endOffset as well, and fix behaviour in edge cases
+ // (e.g. any use of referenceNode.length is incorrect at the edges).
+
+ // Walk to the start of the first non-empty text node inside the scope.
+ this.seekTo(0);
}
- seek(where: number | Text): number {
- const iter = this.iter;
-
- let count = 0;
- let node: Node | null = iter.referenceNode;
- let predicates = null;
-
- if (isInteger(where)) {
- predicates = {
- forward: () => count < where,
- backward: () => count > where || !iter.pointerBeforeReferenceNode,
- };
- } else if (isText(where)) {
- predicates = {
- forward: before(node, where) ? () => false : () => node !== where,
- backward: () => node !== where || !iter.pointerBeforeReferenceNode,
- };
- } else {
- throw new TypeError(E_WHERE);
- }
+ // seekCodePoints(count: number) {
+ // }
- while (predicates.forward()) {
- node = iter.nextNode();
+ seekBy(count: number) {
+ return this.seekTo(this.position + count);
+ }
- if (node === null) {
- throw new RangeError(E_END);
+ seekTo(target: number) {
+ // Move the iterator to after the current node, so nextNode() would cause a jump.
+ if (this.iter.pointerBeforeReferenceNode)
+ this.iter.nextNode();
+
+ while (this.position <= target) {
+ if (target < this.referenceNodeIndex + this.referenceNode.length) {
+ // The target is before the end of the current node.
+ // (we use < not ≤: if the target is *at* the end of the node, possibly
+ // because the current node is empty, we prefer to take the next node)
+ this.offsetInReferenceNode = target - this.referenceNodeIndex;
+ // if (this.countCodePoints)
+ // this.codePointCount += [...this.referenceNode.data.substring(oldOffset, this.offsetInReferenceNode)].length;
+ break;
}
- count += (node as Text).data.length;
- }
-
- // If there are subsequent nodes, move to ‘before’ the next non-empty
- // node (or the last node, in case all subsequent nodes are empty).
- // As this moves from ‘after’ the current node, count is not changed.
- if (iter.nextNode()) {
- node = iter.referenceNode;
- while (node !== null && (node as Text).data.length === 0) { // node should always be Text now due to the NodeFilter.
- node = iter.nextNode();
+ // Move to the start of the next node, while counting the characters of the current one.
+ const curNode = this.referenceNode;
+ const nextNode = this.iter.nextNode();
+ if (nextNode !== null) {
+ this.referenceNodeIndex += curNode.length;
+ this.offsetInReferenceNode = 0;
+ // if (this.countCodePoints)
+ // this.codePointCount += [...curNode.data].length;
+ } else {
+ // There is no next node. Finish at the end of the last node.
+ this.offsetInReferenceNode = this.referenceNode.length;
+ // if (this.countCodePoints)
+ // this.codePointCount += [...this.referenceNode.data.substring(this.offsetInReferenceNode)].length;
+ // Either the end of this node is our target, or the seek failed.
+ if (this.position === target)
+ break;
+ else
+ throw new RangeError(E_END);
}
- // Note this direction switch stays within the same node.
- node = iter.previousNode();
}
- while (predicates.backward()) {
- node = iter.previousNode();
+ // Move to the start of the current node.
+ if (!this.iter.pointerBeforeReferenceNode)
+ this.iter.previousNode();
- if (node === null) {
- throw new RangeError(E_END);
+ while (this.position > target) {
+ if (this.referenceNodeIndex <= target) {
+ this.offsetInReferenceNode = target - this.referenceNodeIndex;
+ // if (this.countCodePoints)
+ // this.codePointCount -= [...this.referenceNode.data.substring(this.offsetInReferenceNode, oldOffset)].length;
+ break;
}
- count -= (node as Text).data.length;
- }
-
- if (!isText(iter.referenceNode)) {
- throw new RangeError(E_END);
+ // Move to the end of the previous node.
+ // const curNode = this.referenceNode;
+ const prevNode = this.iter.previousNode();
+ if (prevNode !== null) {
+ this.referenceNodeIndex -= this.referenceNode.length;
+ this.offsetInReferenceNode = this.referenceNode.length;
+ // if (this.countCodePoints)
+ // this.codePointCount -= [...curNode.data].length;
+ } else {
+ this.offsetInReferenceNode = 0;
+ // this.codePointCount -= [...this.referenceNode.data.substring(0, oldOffset)].length;
+ throw new RangeError(E_END);
+ }
}
-
- return count;
}
}
-function isInteger(n: any): n is number {
- if (typeof n !== 'number') return false;
- return isFinite(n) && Math.floor(n) === n;
-}
-
function isText(node: Node): node is Text {
return node.nodeType === Node.TEXT_NODE;
}
-
-function before(ref: Node, node: Node): boolean {
- return !!(ref.compareDocumentPosition(node) & Node.DOCUMENT_POSITION_PRECEDING);
-}
diff --git a/packages/dom/src/text-position/match.ts b/packages/dom/src/text-position/match.ts
index 7a57226..b14a3b2 100644
--- a/packages/dom/src/text-position/match.ts
+++ b/packages/dom/src/text-position/match.ts
@@ -27,51 +27,23 @@ export function createTextPositionSelectorMatcher(
): Matcher<Range, Range> {
return async function* matchAll(scope) {
const document = ownerDocument(scope);
- const scopeText = scope.toString();
const { start, end } = selector;
const seeker = new Seeker(scope);
- // The index of the first character of iter.referenceNode inside the text.
- let referenceNodeIndex = isTextNode(scope.startContainer)
- ? -scope.startOffset
- : 0;
-
- // String indices are based on code points, not code units, so we actually have to count.
- const matchStartIndex = getIndexOfCharacterNumber(scopeText, start);
- const matchEndIndex = getIndexOfCharacterNumber(scopeText, end);
-
// Create a range to represent the described text in the dom.
const match = document.createRange();
// Seek to the start of the match, make the range start there.
- referenceNodeIndex += seeker.seek(matchStartIndex - referenceNodeIndex);
- match.setStart(seeker.getCurrentNode(), matchStartIndex - referenceNodeIndex);
+ seeker.seekTo(start);
+ match.setStart(seeker.referenceNode, seeker.referenceNodeIndex);
// Seek to the end of the match, make the range end there.
- referenceNodeIndex += seeker.seek(matchEndIndex - referenceNodeIndex);
- match.setEnd(seeker.getCurrentNode(), matchEndIndex - referenceNodeIndex);
+ seeker.seekTo(end);
+ match.setEnd(seeker.referenceNode, seeker.referenceNodeIndex);
// Yield the match.
yield match;
};
}
-
-function isTextNode(node: Node): node is Text {
- return node.nodeType === Node.TEXT_NODE;
-}
-
-function getIndexOfCharacterNumber(text: string, characterNumber: number): number {
- let index = 0;
- let characterCount = 0;
- for (let character of text) {
- if (characterCount >= characterNumber) // using >= to avoid infinite loop on invalid input.
- break;
- index += character.length; // note the length is either 1 or 2
- characterCount++;
- }
- if (characterCount === characterNumber)
- return index;
- throw new RangeError;
-}
diff --git a/packages/dom/src/text-quote/match.ts b/packages/dom/src/text-quote/match.ts
index 6282769..d9b7da9 100644
--- a/packages/dom/src/text-quote/match.ts
+++ b/packages/dom/src/text-quote/match.ts
@@ -34,12 +34,15 @@ export function createTextQuoteSelectorMatcher(
const suffix = selector.suffix || '';
const searchPattern = prefix + exact + suffix;
- const seeker = new Seeker(scope);
-
- // The index of the first character of iter.referenceNode inside the text.
- let referenceNodeIndex = isTextNode(scope.startContainer)
- ? -scope.startOffset
- : 0;
+ let seeker: Seeker;
+ try {
+ seeker = new Seeker(scope);
+ } catch (error) {
+ // If the scope does not contain text nodes, we can stop. (if it contains
+ // only empty text nodes we continue: it would still match an empty quote)
+ if (error instanceof RangeError) return;
+ else throw error;
+ }
let fromIndex = 0;
while (fromIndex <= scopeText.length) {
@@ -55,12 +58,12 @@ export function createTextQuoteSelectorMatcher(
const match = document.createRange();
// Seek to the start of the match, make the range start there.
- referenceNodeIndex += seeker.seek(matchStartIndex - referenceNodeIndex);
- match.setStart(seeker.getCurrentNode(), matchStartIndex - referenceNodeIndex);
+ seeker.seekTo(matchStartIndex);
+ match.setStart(seeker.referenceNode, seeker.offsetInReferenceNode);
// Seek to the end of the match, make the range end there.
- referenceNodeIndex += seeker.seek(matchEndIndex - referenceNodeIndex);
- match.setEnd(seeker.getCurrentNode(), matchEndIndex - referenceNodeIndex);
+ seeker.seekTo(matchEndIndex);
+ match.setEnd(seeker.referenceNode, seeker.offsetInReferenceNode);
// Yield the match.
yield match;
@@ -70,7 +73,3 @@ export function createTextQuoteSelectorMatcher(
}
};
}
-
-function isTextNode(node: Node): node is Text {
- return node.nodeType === Node.TEXT_NODE;
-}