You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@annotator.apache.org by ge...@apache.org on 2020/11/11 17:38:02 UTC

[incubator-annotator] 01/01: Change approach, (re)implement normalizeRange

This is an automated email from the ASF dual-hosted git repository.

gerben pushed a commit to branch import-dom-seek
in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git

commit 0f13c976cb7779e317cd9076425a2259d2b8bafb
Author: Gerben <ge...@treora.com>
AuthorDate: Wed Nov 11 16:54:10 2020 +0100

    Change approach, (re)implement normalizeRange
---
 packages/dom/src/chunker.ts             |  25 ++++++
 packages/dom/src/normalize-range.ts     | 135 ++++++++++++++++++++++++++++++++
 packages/dom/src/seek.ts                |  21 +----
 packages/dom/src/text-quote/describe.ts |  20 +----
 4 files changed, 166 insertions(+), 35 deletions(-)

diff --git a/packages/dom/src/chunker.ts b/packages/dom/src/chunker.ts
index c8e3015..7209d7a 100644
--- a/packages/dom/src/chunker.ts
+++ b/packages/dom/src/chunker.ts
@@ -18,6 +18,7 @@
  * under the License.
  */
 
+import { normalizeRange } from "./normalize-range";
 import { ownerDocument } from "./owner-document";
 
 // A Chunk represents a fragment (typically a string) of some document.
@@ -78,6 +79,12 @@ export class TextNodeChunker implements Chunker<PartialTextNode> {
     const node = this.iter.referenceNode;
     if (!isText(node))
       return null;
+    return this.nodeToChunk(node);
+  }
+
+  nodeToChunk(node: Text): PartialTextNode {
+    if (!this.scope.intersectsNode(node))
+      throw new Error('Cannot convert node to chunk, as it falls outside of chunker’s scope.');
     const startOffset = (node === this.scope.startContainer) ? this.scope.startOffset : 0;
     const endOffset = (node === this.scope.endContainer) ? this.scope.endOffset : node.length;
     return {
@@ -85,9 +92,27 @@ export class TextNodeChunker implements Chunker<PartialTextNode> {
       startOffset,
       endOffset,
       data: node.data.substring(startOffset, endOffset),
+      equals(other) {
+        return (
+          other.node === this.node
+          && other.startOffset === this.startOffset
+          && other.endOffset === this.endOffset
+        );
+      },
     }
   }
 
+  // Converts a DOM Range into a ChunkRange whose indices count from each chunk's start.
+  rangeToChunkRange(range: Range): ChunkRange<PartialTextNode> {
+    // normalizeRange() modifies its argument, so it is given a copy of the caller's range.
+    const textRange = normalizeRange(range.cloneRange());
+    const startChunk = this.nodeToChunk(textRange.startContainer);
+    const startIndex = textRange.startOffset - startChunk.startOffset;
+    const endChunk = this.nodeToChunk(textRange.endContainer);
+    const endIndex = textRange.endOffset - endChunk.startOffset;
+    return { startChunk, startIndex, endChunk, endIndex };
+  }
+
   constructor(private scope: Range) {
     this.iter = ownerDocument(scope).createNodeIterator(
       scope.commonAncestorContainer,
diff --git a/packages/dom/src/normalize-range.ts b/packages/dom/src/normalize-range.ts
new file mode 100644
index 0000000..8616bce
--- /dev/null
+++ b/packages/dom/src/normalize-range.ts
@@ -0,0 +1,135 @@
+/**
+ * @license
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import { ownerDocument } from "./owner-document";
+
+// TextRange is a Range that guarantees to always have Text nodes as its start
+// and end nodes. To ensure the type remains correct, it also restricts usage
+// of methods that would modify these nodes (note that a user can simply cast
+// the TextRange back to a Range to remove these restrictions).
+export interface TextRange extends Range {
+  // Both boundary containers are declared as Text rather than any Node.
+  readonly startContainer: Text;
+  readonly endContainer: Text;
+  // A clone is typed as a TextRange too.
+  cloneRange(): TextRange;
+  // Methods that take a node accept Text nodes only.
+  setStart(node: Text, offset: number): void;
+  setEnd(node: Text, offset: number): void;
+  selectNodeContents(node: Text): void;
+  insertNode(node: Text): void;
+  // Typed with a `never` parameter so the type checker rejects ordinary calls.
+  surroundContents(newParent: never): void;
+  selectNode(node: never): void;
+  setStartBefore(node: never): void;
+  setStartAfter(node: never): void;
+  setEndBefore(node: never): void;
+  setEndAfter(node: never): void;
+}
+
+// Normalise a range such that both its start and end are text nodes, and that
+// if there are equivalent text selections it takes the narrowest option (i.e.
+// it prefers the start not to be at the end of a text node, and vice versa).
+//
+// Note that if the given range does not contain non-empty text nodes, it will
+// end up pointing at a text node outside of it (after it if possible, else
+// before). If the document does not contain any text nodes, an error is thrown.
+export function normalizeRange(range: Range): TextRange {
+  const doc = ownerDocument(range);
+  const textWalker = doc.createTreeWalker(doc, NodeFilter.SHOW_TEXT);
+
+  // Start: snap to a text node; while the offset sits at the end of its node
+  // (always the case for an empty node), hop to the start of the next text node.
+  let [ startNode, startPos ] = snapBoundaryPointToTextNode(range.startContainer, range.startOffset);
+  textWalker.currentNode = startNode;
+  while (startPos === startNode.length) {
+    const following = textWalker.nextNode();
+    if (following === null) break;
+    startNode = following as Text;
+    startPos = 0;
+  }
+  range.setStart(startNode, startPos);
+
+  // End: read only after the start has been applied, then mirrored; while the
+  // offset is zero, hop back to the end of the previous text node.
+  let [ endNode, endPos ] = snapBoundaryPointToTextNode(range.endContainer, range.endOffset);
+  textWalker.currentNode = endNode;
+  while (endPos === 0) {
+    const preceding = textWalker.previousNode();
+    if (preceding === null) break;
+    endNode = preceding as Text;
+    endPos = endNode.length;
+  }
+  range.setEnd(endNode, endPos);
+
+  return range as TextRange;
+}
+
+// Given an arbitrary boundary point, this returns either:
+// - that same boundary point, if its node is a text node;
+// - otherwise the first boundary point after it whose node is a text node, if any;
+// - otherwise (also at the very end of the document), the last text boundary point before it.
+// If the document has no text nodes, it throws an error.
+function snapBoundaryPointToTextNode(node: Node, offset: number): [Text, number] {
+  if (isText(node)) return [node, offset];
+
+  // Find the node at or right after the boundary point.
+  const document = node.ownerDocument ?? node as Document;
+  let curNode: Node;
+  if (isCharacterData(node)) {
+    curNode = node;
+  } else if (offset < node.childNodes.length) {
+    curNode = node.childNodes[offset];
+  } else {
+    curNode = node;
+    while (curNode.nextSibling === null) {
+      if (curNode.parentNode === null) {
+        // Reached the top of the tree, so nothing follows: take its last text node.
+        const last = document.createTreeWalker(curNode, NodeFilter.SHOW_TEXT).lastChild() as Text | null;
+        if (last === null) throw new Error('Document contains no text nodes.');
+        return [last, last.length];
+      }
+      curNode = curNode.parentNode;
+    }
+    curNode = curNode.nextSibling;
+  }
+  if (isText(curNode)) return [curNode, 0];
+
+  // Walk to the next text node, or the last if there is none.
+  const walker = document.createTreeWalker(document, NodeFilter.SHOW_TEXT);
+  walker.currentNode = curNode;
+  if (walker.nextNode() !== null)
+    return [walker.currentNode as Text, 0];
+  if (walker.previousNode() !== null)
+    return [walker.currentNode as Text, (walker.currentNode as Text).length];
+  throw new Error('Document contains no text nodes.');
+}
+
+function isText(node: Node): node is Text {
+  return Node.TEXT_NODE === node.nodeType; // type guard keyed on nodeType
+}
+
+function isCharacterData(node: Node): node is CharacterData {
+  // Accepted node kinds: processing instruction, comment and text.
+  const characterDataTypes: number[] = [
+    Node.PROCESSING_INSTRUCTION_NODE, Node.COMMENT_NODE, Node.TEXT_NODE,
+  ];
+  return characterDataTypes.includes(node.nodeType);
+}
diff --git a/packages/dom/src/seek.ts b/packages/dom/src/seek.ts
index 7d7c107..cd314b6 100644
--- a/packages/dom/src/seek.ts
+++ b/packages/dom/src/seek.ts
@@ -18,7 +18,7 @@
  * under the License.
  */
 
-import { Chunk, Chunker, TextNodeChunker, PartialTextNode, chunkEquals } from "./chunker";
+import { Chunk, TextNodeChunker, PartialTextNode, chunkEquals } from "./chunker";
 
 const E_END = 'Iterator exhausted before seek ended.';
 
@@ -165,10 +165,8 @@ export class TextSeeker<TChunk extends Chunk<string>> implements Seeker<string>
 }
 
 export class DomSeeker extends TextSeeker<PartialTextNode> implements BoundaryPointer<Text> {
-  constructor(chunkerOrScope: Chunker<PartialTextNode> | Range) {
-    const chunker = 'currentChunk' in chunkerOrScope
-      ? chunkerOrScope
-      : new TextNodeChunker(chunkerOrScope);
+  constructor(scope: Range) {
+    const chunker = new TextNodeChunker(scope);
     if (chunker.currentChunk === null)
       throw new RangeError('Range does not contain any Text nodes.');
     super(chunker as NonEmptyChunker<PartialTextNode>);
@@ -181,17 +179,4 @@ export class DomSeeker extends TextSeeker<PartialTextNode> implements BoundaryPo
   get offsetInReferenceNode() {
     return this.offsetInChunk + this.currentChunk.startOffset;
   }
-
-  seekToBoundaryPoint(node: Node, offset: number) {
-    const document = (node.ownerDocument ?? node as Document);
-    const target = document.createRange();
-    target.setStart(node, offset);
-    // target.setEnd(node, offset); // (implied by setting the start)
-
-    // Seek step by step until we are at, or crossed, the target point.
-    const reverse = !!(node.compareDocumentPosition(this.referenceNode) & Node.DOCUMENT_POSITION_PRECEDING);
-    while (target.comparePoint(this.referenceNode, this.offsetInReferenceNode) === (reverse ? 1 : -1)) {
-      this.seekBy(reverse ? -1 : 1);
-    }
-  }
 }
diff --git a/packages/dom/src/text-quote/describe.ts b/packages/dom/src/text-quote/describe.ts
index 8ccf47e..cbad0c3 100644
--- a/packages/dom/src/text-quote/describe.ts
+++ b/packages/dom/src/text-quote/describe.ts
@@ -20,9 +20,9 @@
 
 import type { TextQuoteSelector } from '@annotator/selector';
 import { ownerDocument } from '../owner-document';
-import { Chunk, Chunker, ChunkRange, PartialTextNode, TextNodeChunker, chunkRangeEquals } from '../chunker';
+import { Chunk, Chunker, ChunkRange, TextNodeChunker, chunkRangeEquals } from '../chunker';
 import { abstractTextQuoteSelectorMatcher } from '.';
-import { DomSeeker, TextSeeker, NonEmptyChunker } from '../seek';
+import { TextSeeker, NonEmptyChunker } from '../seek';
 
 export async function describeTextQuote(
   range: Range,
@@ -45,7 +45,7 @@ export async function describeTextQuote(
     range.setEnd(scope.endContainer, scope.endOffset);
 
   return await abstractDescribeTextQuote(
-    convertRangeToChunkRange(chunker, range),
+    chunker.rangeToChunkRange(range),
     chunker,
   );
 }
@@ -114,17 +114,3 @@ async function abstractDescribeTextQuote<TChunk extends Chunk<string>>(
       throw new Error('Target cannot be disambiguated; how could that have happened‽');
   }
 }
-
-function convertRangeToChunkRange(chunker: Chunker<PartialTextNode>, range: Range): ChunkRange<PartialTextNode> {
-  const domSeeker = new DomSeeker(chunker);
-
-  domSeeker.seekToBoundaryPoint(range.startContainer, range.startOffset);
-  const startChunk = domSeeker.currentChunk;
-  const startIndex = domSeeker.offsetInChunk;
-
-  domSeeker.seekToBoundaryPoint(range.endContainer, range.endOffset);
-  const endChunk = domSeeker.currentChunk;
-  const endIndex = domSeeker.offsetInChunk;
-
-  return { startChunk, startIndex, endChunk, endIndex };
-}