You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@annotator.apache.org by ge...@apache.org on 2020/10/08 21:19:14 UTC

[incubator-annotator] 01/01: Implement text-position matching.

This is an automated email from the ASF dual-hosted git repository.

gerben pushed a commit to branch text-position
in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git

commit ccfd9288cf17331dba46330e899122009a7c7f8d
Author: Gerben <ge...@treora.com>
AuthorDate: Thu Oct 8 23:15:05 2020 +0200

    Implement text-position matching.
    
    Largely copying from the text quote implementation.
    No effort done yet regarding deduplication, abstraction, efficiency.
---
 .../types.ts => dom/src/text-position/index.ts}    |  26 +--
 packages/dom/src/text-position/match.ts            |  88 ++++++++
 packages/dom/test/text-position/match-cases.ts     | 142 +++++++++++++
 packages/dom/test/text-position/match.test.ts      | 227 +++++++++++++++++++++
 packages/selector/src/index.ts                     |   2 +-
 packages/selector/src/types.ts                     |   6 +
 6 files changed, 465 insertions(+), 26 deletions(-)

diff --git a/packages/selector/src/types.ts b/packages/dom/src/text-position/index.ts
similarity index 61%
copy from packages/selector/src/types.ts
copy to packages/dom/src/text-position/index.ts
index fc4f64b..011e994 100644
--- a/packages/selector/src/types.ts
+++ b/packages/dom/src/text-position/index.ts
@@ -18,28 +18,4 @@
  * under the License.
  */
 
-export interface Selector {
-  refinedBy?: Selector;
-}
-
-export interface CssSelector extends Selector {
-  type: 'CssSelector';
-  value: string;
-}
-
-export interface TextQuoteSelector extends Selector {
-  type: 'TextQuoteSelector';
-  exact: string;
-  prefix?: string;
-  suffix?: string;
-}
-
-export interface RangeSelector extends Selector {
-  type: 'RangeSelector';
-  startSelector: Selector;
-  endSelector: Selector;
-}
-
-export interface Matcher<TScope, TMatch> {
-  (scope: TScope): AsyncGenerator<TMatch, void, void>;
-}
+export * from './match';
diff --git a/packages/dom/src/text-position/match.ts b/packages/dom/src/text-position/match.ts
new file mode 100644
index 0000000..a579e94
--- /dev/null
+++ b/packages/dom/src/text-position/match.ts
@@ -0,0 +1,88 @@
+/**
+ * @license
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import seek from 'dom-seek';
+import type { Matcher, TextPositionSelector } from '@annotator/selector';
+import { ownerDocument } from '../owner-document';
+
+export function createTextPositionSelectorMatcher( // Builds a matcher that resolves `selector` to a DOM Range inside a given scope Range.
+  selector: TextPositionSelector,
+): Matcher<Range, Range> {
+  return async function* matchAll(scope) { // Yields a single Range; a position beyond the scope's text makes getIndexOfCharacterNumber throw a RangeError.
+    const document = ownerDocument(scope);
+    const scopeText = scope.toString();
+
+    const { start, end } = selector; // NOTE(review): nothing below checks that start <= end — confirm callers guarantee it.
+
+    const iter = document.createNodeIterator(
+      scope.commonAncestorContainer,
+      NodeFilter.SHOW_TEXT,
+      {
+        acceptNode(node: Text) {
+          // Only reveal text nodes that intersect the scope, and skip any empty text nodes.
+          return scope.intersectsNode(node) && node.length > 0
+            ? NodeFilter.FILTER_ACCEPT
+            : NodeFilter.FILTER_REJECT;
+        },
+      },
+    );
+
+    // Index, within scopeText, of the first character of iter.referenceNode; negative when the scope starts partway into a text node.
+    let referenceNodeIndex = isTextNode(scope.startContainer)
+      ? -scope.startOffset
+      : 0;
+
+    // Selector positions are counted in code points, whereas string indices count UTF-16 code units, so convert by counting.
+    const matchStartIndex = getIndexOfCharacterNumber(scopeText, start);
+    const matchEndIndex = getIndexOfCharacterNumber(scopeText, end);
+
+    // Create a range to represent the described text in the DOM.
+    const match = document.createRange();
+
+    // Seek to the start of the match, make the range start there (seek's result is presumably the distance actually moved — verify against dom-seek).
+    referenceNodeIndex += seek(iter, matchStartIndex - referenceNodeIndex);
+    match.setStart(iter.referenceNode, matchStartIndex - referenceNodeIndex);
+
+    // Seek onwards to the end of the match, make the range end there.
+    referenceNodeIndex += seek(iter, matchEndIndex - referenceNodeIndex);
+    match.setEnd(iter.referenceNode, matchEndIndex - referenceNodeIndex);
+
+    // Yield the one and only match.
+    yield match;
+  };
+}
+
+function isTextNode(node: Node): node is Text { // Type guard: narrows `node` to Text when its nodeType is TEXT_NODE.
+  return node.nodeType === Node.TEXT_NODE;
+}
+
+function getIndexOfCharacterNumber(text: string, characterNumber: number): number { // Converts a code-point count into the matching UTF-16 code-unit index in `text`.
+  let index = 0; // code units consumed so far
+  let characterCount = 0; // code points consumed so far
+  for (let character of text) { // for...of walks a string one code point at a time. NOTE(review): `character` is never reassigned; could be const.
+    if (characterCount >= characterNumber) // using >= (not ===) so invalid input (negative or non-integer) ends the scan early; the loop is bounded by the text length either way.
+      break;
+    index += character.length; // note the length is either 1 or 2
+    characterCount++;
+  }
+  if (characterCount === characterNumber)
+    return index;
+  throw new RangeError; // Reached when text has fewer than characterNumber code points, or characterNumber is negative or not an integer. NOTE(review): no message is attached.
+}
diff --git a/packages/dom/test/text-position/match-cases.ts b/packages/dom/test/text-position/match-cases.ts
new file mode 100644
index 0000000..0916446
--- /dev/null
+++ b/packages/dom/test/text-position/match-cases.ts
@@ -0,0 +1,142 @@
+/**
+ * @license
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import type { TextPositionSelector } from '@annotator/selector';
+import type { RangeInfo } from '../utils';
+
+export const testCases: {
+  [name: string]: {
+    html: string;
+    selector: TextPositionSelector;
+    expected: RangeInfo[];
+  };
+} = {
+  simple: {
+    html: '<b>l😃rem ipsum dolor amet yada yada</b>',
+    selector: {
+      type: 'TextPositionSelector',
+      start: 12,
+      end: 20,
+    },
+    expected: [
+      {
+        startContainerXPath: '//b/text()',
+        startOffset: 13,
+        endContainerXPath: '//b/text()',
+        endOffset: 21,
+      },
+    ],
+  },
+  'first characters': {
+    html: '<b>l😃rem ipsum dolor amet yada yada</b>',
+    selector: {
+      type: 'TextPositionSelector',
+      start: 0,
+      end: 11,
+    },
+    expected: [
+      {
+        startContainerXPath: '//b/text()',
+        startOffset: 0,
+        endContainerXPath: '//b/text()',
+        endOffset: 12,
+      },
+    ],
+  },
+  'last characters': {
+    html: '<b>l😃rem ipsum dolor amet yada yada</b>',
+    selector: {
+      type: 'TextPositionSelector',
+      start: 23,
+      end: 32,
+    },
+    expected: [
+      {
+        startContainerXPath: '//b/text()',
+        startOffset: 24,
+        endContainerXPath: '//b/text()',
+        endOffset: 33,
+      },
+    ],
+  },
+  'across elements': {
+    html: '<b>l😃rem <i>ipsum</i> dolor <u>amet</u> yada yada</b>',
+    selector: {
+      type: 'TextPositionSelector',
+      start: 12,
+      end: 20,
+    },
+    expected: [
+      {
+        startContainerXPath: '//b/text()[2]',
+        startOffset: 1,
+        endContainerXPath: '//u/text()',
+        endOffset: 2,
+      },
+    ],
+  },
+  'exact element contents': {
+    html: '<b>l😃rem <i>ipsum dolor</i> amet yada yada</b>',
+    selector: {
+      type: 'TextPositionSelector',
+      start: 6,
+      end: 17,
+    },
+    expected: [
+      {
+        startContainerXPath: '//i/text()',
+        startOffset: 0,
+        endContainerXPath: '//b/text()[2]',
+        endOffset: 0,
+      },
+    ],
+  },
+  'text inside <head>': {
+    html:
+      '<head><title>l😃rem ipsum dolor amet</title></head><b>yada yada</b>',
+    selector: {
+      type: 'TextPositionSelector',
+      start: 18,
+      end: 22,
+    },
+    expected: [
+      {
+        startContainerXPath: '//title/text()',
+        startOffset: 19,
+        endContainerXPath: '//b/text()[1]',
+        endOffset: 0,
+      },
+    ],
+  },
+  'empty quote': {
+    html: '<b>l😃rem</b>',
+    selector: {
+      type: 'TextPositionSelector',
+      start: 3,
+      end: 3,
+    },
+    expected: [{
+      startContainerXPath: '//b/text()',
+      startOffset: 4,
+      endContainerXPath: '//b/text()',
+      endOffset: 4,
+    }],
+  },
+};
diff --git a/packages/dom/test/text-position/match.test.ts b/packages/dom/test/text-position/match.test.ts
new file mode 100644
index 0000000..1acaed0
--- /dev/null
+++ b/packages/dom/test/text-position/match.test.ts
@@ -0,0 +1,227 @@
+/**
+ * @license
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import { assert } from 'chai';
+import type { TextPositionSelector } from '@annotator/selector';
+import { createTextPositionSelectorMatcher } from '../../src/text-position/match';
+import { evaluateXPath } from '../utils';
+import type { RangeInfo } from '../utils';
+import { testCases } from './match-cases';
+
+const domParser = new window.DOMParser();
+
+describe('createTextPositionSelectorMatcher', () => {
+  for (const [name, { html, selector, expected }] of Object.entries(
+    testCases,
+  )) {
+    it(`works for case: '${name}'`, async () => {
+      const doc = domParser.parseFromString(html, 'text/html');
+
+      const scope = doc.createRange();
+      scope.selectNodeContents(doc);
+
+      await testMatcher(doc, scope, selector, expected);
+    });
+  }
+
+  it('handles adjacent text nodes', async () => {
+    const { html, selector } = testCases['simple'];
+    const doc = domParser.parseFromString(html, 'text/html');
+
+    const scope = doc.createRange();
+    scope.selectNodeContents(doc);
+
+    const textNode = evaluateXPath(doc, '//b/text()') as Text;
+
+    textNode.splitText(16);
+    // console.log([...textNode.parentNode.childNodes].map(node => node.textContent))
+    // → [ 'l😃rem ipsum dol', 'or amet yada yada' ]
+
+    await testMatcher(doc, scope, selector, [
+      {
+        startContainerXPath: '//b/text()[1]',
+        startOffset: 13,
+        endContainerXPath: '//b/text()[2]',
+        endOffset: 5,
+      },
+    ]);
+  });
+
+  it('handles empty text nodes', async () => {
+    const { html, selector } = testCases['simple'];
+    const doc = domParser.parseFromString(html, 'text/html');
+
+    const scope = doc.createRange();
+    scope.selectNodeContents(doc);
+
+    const textNode = evaluateXPath(doc, '//b/text()') as Text;
+    textNode.splitText(textNode.length);
+    textNode.splitText(21);
+    textNode.splitText(21);
+    textNode.splitText(18);
+    textNode.splitText(18);
+    textNode.splitText(13);
+    textNode.splitText(13);
+    textNode.splitText(0);
+    // console.log([...textNode.parentNode.childNodes].map(node => node.textContent))
+    // → [ '', 'l😃rem ipsum ', '', 'dolor', '', ' am', '', 'et yada yada', '' ]
+
+
+    await testMatcher(doc, scope, selector, [
+      {
+        startContainerXPath: '//b/text()[4]', // "dolor"
+        startOffset: 0,
+        endContainerXPath: '//b/text()[8]', // "et yada yada"
+        endOffset: 0,
+      },
+    ]);
+  });
+
+  it('works when scope spans one text node’s contents, matching its first characters', async () => {
+    const { html, selector, expected } = testCases['first characters'];
+    const doc = domParser.parseFromString(html, 'text/html');
+
+    const scope = doc.createRange();
+    scope.selectNodeContents(evaluateXPath(doc, '//b/text()'));
+
+    await testMatcher(doc, scope, selector, expected);
+  });
+
+  it('works when scope starts with an empty text node, matching its first characters', async () => {
+    const { html, selector } = testCases['first characters'];
+    const doc = domParser.parseFromString(html, 'text/html');
+
+    const textNode = evaluateXPath(doc, '//b/text()') as Text;
+    textNode.splitText(0);
+
+    const scope = doc.createRange();
+    scope.selectNodeContents(evaluateXPath(doc, '//b'));
+
+    await testMatcher(doc, scope, selector, [
+      {
+        startContainerXPath: '//b/text()[2]',
+        startOffset: 0,
+        endContainerXPath: '//b/text()[2]',
+        endOffset: 12,
+      },
+    ]);
+  });
+
+  it('works when scope has both ends within one text node', async () => {
+    const { html, expected } = testCases['simple'];
+
+    const doc = domParser.parseFromString(html, 'text/html');
+
+    // Use the substring ‘ipsum dolor amet’ as scope.
+    const scope = doc.createRange();
+    scope.setStart(evaluateXPath(doc, '//b/text()'), 7);
+    scope.setEnd(evaluateXPath(doc, '//b/text()'), 23);
+
+    const selector: TextPositionSelector = {
+      type: 'TextPositionSelector',
+      start: 6,
+      end: 14,
+    };
+
+    await testMatcher(doc, scope, selector, expected);
+  });
+
+  it('works when scope has both ends inside text nodes', async () => {
+    const { html, expected } = testCases['across elements'];
+    const doc = domParser.parseFromString(html, 'text/html');
+
+    // Use the substring ‘sum dolor am’ as scope.
+    const scope = doc.createRange();
+    scope.setStart(evaluateXPath(doc, '//i/text()'), 2);
+    scope.setEnd(evaluateXPath(doc, '//u/text()'), 2);
+
+    const selector: TextPositionSelector = {
+      type: 'TextPositionSelector',
+      start: 4,
+      end: 12,
+    };
+
+    await testMatcher(doc, scope, selector, expected);
+  });
+
+  it('works when scope has both ends inside an element', async () => {
+    const { html, expected } = testCases['across elements'];
+    const doc = domParser.parseFromString(html, 'text/html');
+
+    const scope = doc.createRange();
+    scope.setStart(evaluateXPath(doc, '//b'), 1); // before the <i>
+    scope.setEnd(evaluateXPath(doc, '//b'), 4); // before the " yada yada"
+    const selector: TextPositionSelector = {
+      type: 'TextPositionSelector',
+      start: 6,
+      end: 14,
+    };
+    await testMatcher(doc, scope, selector, expected);
+  });
+});
+
+async function testMatcher( // Runs the matcher for `selector` on `scope` and asserts the yielded ranges equal `expected`, in order.
+  doc: Document,
+  scope: Range,
+  selector: TextPositionSelector,
+  expected: RangeInfo[],
+) {
+  const matcher = createTextPositionSelectorMatcher(selector);
+  const matches = [];
+  for await (const value of matcher(scope)) matches.push(value); // collect everything the async generator yields
+  assert.equal(matches.length, expected.length);
+  matches.forEach((match, i) => {
+    const expectedRange = expected[i];
+    const expectedStartContainer = evaluateXPath(
+      doc,
+      expectedRange.startContainerXPath,
+    );
+    const expectedEndContainer = evaluateXPath(
+      doc,
+      expectedRange.endContainerXPath,
+    );
+    assert(
+      match.startContainer === expectedStartContainer,
+      `unexpected start container: ${prettyNodeName(match.startContainer)}; ` +
+        `expected ${prettyNodeName(expectedStartContainer)}`,
+    );
+    assert.equal(match.startOffset, expectedRange.startOffset);
+    assert(
+      match.endContainer ===
+        evaluateXPath(doc, expectedRange.endContainerXPath), // NOTE(review): same lookup as expectedEndContainer above; that variable could be compared directly.
+      `unexpected end container: ${prettyNodeName(match.endContainer)}; ` +
+        `expected ${prettyNodeName(expectedEndContainer)}`,
+    );
+    assert.equal(match.endOffset, expectedRange.endOffset);
+  });
+}
+
+function prettyNodeName(node: Node) { // Short human-readable label for a node, used in the assertion failure messages of testMatcher.
+  switch (node.nodeType) {
+    case Node.TEXT_NODE: { // text node: show its value, cut to the first 50 UTF-16 units plus an ellipsis
+      const text = (node as Text).nodeValue || '';
+      return `#text "${text.length > 50 ? text.substring(0, 50) + '…' : text}"`;
+    }
+    case Node.ELEMENT_NODE: // element: lower-cased tag name in angle brackets
+      return `<${(node as Element).tagName.toLowerCase()}>`;
+    default: // anything else: lower-cased nodeName
+      return node.nodeName.toLowerCase();
+  }
+}
diff --git a/packages/selector/src/index.ts b/packages/selector/src/index.ts
index c66bd94..ffab70b 100644
--- a/packages/selector/src/index.ts
+++ b/packages/selector/src/index.ts
@@ -21,7 +21,7 @@
 import type { Matcher, Selector } from './types';
 
 export type { Matcher, Selector } from './types';
-export type { CssSelector, RangeSelector, TextQuoteSelector } from './types';
+export type { CssSelector, RangeSelector, TextPositionSelector, TextQuoteSelector } from './types';
 
 export function makeRefinable<
   // Any subtype of Selector can be made refinable; but note we limit the value
diff --git a/packages/selector/src/types.ts b/packages/selector/src/types.ts
index fc4f64b..e57fed0 100644
--- a/packages/selector/src/types.ts
+++ b/packages/selector/src/types.ts
@@ -34,6 +34,12 @@ export interface TextQuoteSelector extends Selector {
   suffix?: string;
 }
 
+export interface TextPositionSelector extends Selector {
+  type: 'TextPositionSelector';
+  start: number; // more precisely: non-negative integer; position of the first selected character (0 = first character of the text)
+  end: number; // more precisely: non-negative integer; position just past the last selected character (start === end selects nothing)
+}
+
 export interface RangeSelector extends Selector {
   type: 'RangeSelector';
   startSelector: Selector;