You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@annotator.apache.org by ge...@apache.org on 2020/10/08 21:19:13 UTC

[incubator-annotator] branch text-position created (now ccfd928)

This is an automated email from the ASF dual-hosted git repository.

gerben pushed a change to branch text-position
in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git.


      at ccfd928  Implement text-position matching.

This branch includes the following new commits:

     new ccfd928  Implement text-position matching.

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[incubator-annotator] 01/01: Implement text-position matching.

Posted by ge...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

gerben pushed a commit to branch text-position
in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git

commit ccfd9288cf17331dba46330e899122009a7c7f8d
Author: Gerben <ge...@treora.com>
AuthorDate: Thu Oct 8 23:15:05 2020 +0200

    Implement text-position matching.
    
    Largely copying from the text quote implementation.
    No effort done yet regarding deduplication, abstraction, efficiency.
---
 .../types.ts => dom/src/text-position/index.ts}    |  26 +--
 packages/dom/src/text-position/match.ts            |  88 ++++++++
 packages/dom/test/text-position/match-cases.ts     | 142 +++++++++++++
 packages/dom/test/text-position/match.test.ts      | 227 +++++++++++++++++++++
 packages/selector/src/index.ts                     |   2 +-
 packages/selector/src/types.ts                     |   6 +
 6 files changed, 465 insertions(+), 26 deletions(-)

diff --git a/packages/selector/src/types.ts b/packages/dom/src/text-position/index.ts
similarity index 61%
copy from packages/selector/src/types.ts
copy to packages/dom/src/text-position/index.ts
index fc4f64b..011e994 100644
--- a/packages/selector/src/types.ts
+++ b/packages/dom/src/text-position/index.ts
@@ -18,28 +18,4 @@
  * under the License.
  */
 
-export interface Selector {
-  refinedBy?: Selector;
-}
-
-export interface CssSelector extends Selector {
-  type: 'CssSelector';
-  value: string;
-}
-
-export interface TextQuoteSelector extends Selector {
-  type: 'TextQuoteSelector';
-  exact: string;
-  prefix?: string;
-  suffix?: string;
-}
-
-export interface RangeSelector extends Selector {
-  type: 'RangeSelector';
-  startSelector: Selector;
-  endSelector: Selector;
-}
-
-export interface Matcher<TScope, TMatch> {
-  (scope: TScope): AsyncGenerator<TMatch, void, void>;
-}
+export * from './match';
diff --git a/packages/dom/src/text-position/match.ts b/packages/dom/src/text-position/match.ts
new file mode 100644
index 0000000..a579e94
--- /dev/null
+++ b/packages/dom/src/text-position/match.ts
@@ -0,0 +1,88 @@
+/**
+ * @license
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import seek from 'dom-seek';
+import type { Matcher, TextPositionSelector } from '@annotator/selector';
+import { ownerDocument } from '../owner-document';
+
+/**
+ * Creates a matcher that locates the text described by a TextPositionSelector
+ * inside a given scope.
+ *
+ * The returned async generator takes a Range as its scope, interprets the
+ * selector's `start` and `end` as character counts into the scope's text
+ * (`scope.toString()`), and yields a single Range spanning that text.
+ *
+ * @param selector - The selector whose `start` and `end` are to be resolved.
+ * @returns A matcher yielding one Range for the given scope.
+ * @throws RangeError - raised while iterating the matcher, when `start` or
+ * `end` cannot be converted to an index in the scope's text (see
+ * getIndexOfCharacterNumber).
+ */
+export function createTextPositionSelectorMatcher(
+  selector: TextPositionSelector,
+): Matcher<Range, Range> {
+  return async function* matchAll(scope) {
+    const document = ownerDocument(scope);
+    const scopeText = scope.toString();
+
+    const { start, end } = selector;
+
+    // Iterate over the text nodes below the scope's common ancestor.
+    const iter = document.createNodeIterator(
+      scope.commonAncestorContainer,
+      NodeFilter.SHOW_TEXT,
+      {
+        acceptNode(node: Text) {
+          // Only reveal nodes within the range; and skip any empty text nodes.
+          return scope.intersectsNode(node) && node.length > 0
+            ? NodeFilter.FILTER_ACCEPT
+            : NodeFilter.FILTER_REJECT;
+        },
+      },
+    );
+
+    // The index of the first character of iter.referenceNode inside the text.
+    // If the scope starts partway into a text node, that node's first
+    // character lies before the scope's text, hence the negative index.
+    let referenceNodeIndex = isTextNode(scope.startContainer)
+      ? -scope.startOffset
+      : 0;
+
+    // The selector's positions count characters (code points), whereas string
+    // indices count UTF-16 code units, so we actually have to count.
+    const matchStartIndex = getIndexOfCharacterNumber(scopeText, start);
+    const matchEndIndex = getIndexOfCharacterNumber(scopeText, end);
+
+    // Create a range to represent the described text in the dom.
+    const match = document.createRange();
+
+    // Seek to the start of the match, make the range start there.
+    // NOTE(review): this relies on seek() returning the number of characters
+    // it actually moved the iterator by — confirm against the dom-seek docs.
+    referenceNodeIndex += seek(iter, matchStartIndex - referenceNodeIndex);
+    match.setStart(iter.referenceNode, matchStartIndex - referenceNodeIndex);
+
+    // Seek to the end of the match, make the range end there.
+    referenceNodeIndex += seek(iter, matchEndIndex - referenceNodeIndex);
+    match.setEnd(iter.referenceNode, matchEndIndex - referenceNodeIndex);
+
+    // Yield the match.
+    yield match;
+  };
+}
+
+/** Type guard that reports whether the given node is a Text node. */
+function isTextNode(node: Node): node is Text {
+  const { nodeType } = node;
+  return nodeType === Node.TEXT_NODE;
+}
+
+/**
+ * Converts a character count into a string index.
+ *
+ * Characters are counted as code points (the units a `for…of` loop over a
+ * string produces), while the returned index counts UTF-16 code units, as used
+ * by string indexing and by offsets into text nodes.
+ *
+ * @param text - The string to count characters in.
+ * @param characterNumber - The number of characters to skip from the start of
+ * `text`; must be an integer between 0 and the number of characters in `text`.
+ * @returns The index in `text` directly after the first `characterNumber`
+ * characters.
+ * @throws RangeError if `characterNumber` is negative, not an integer, or
+ * larger than the number of characters in `text`.
+ */
+function getIndexOfCharacterNumber(
+  text: string,
+  characterNumber: number,
+): number {
+  let index = 0;
+  let characterCount = 0;
+  for (const character of text) {
+    // Using >= (not ===) so that a negative or fractional number ends the scan
+    // early; such input is then rejected by the check below.
+    if (characterCount >= characterNumber) {
+      break;
+    }
+    index += character.length; // either 1, or 2 for a surrogate pair
+    characterCount++;
+  }
+  if (characterCount !== characterNumber) {
+    throw new RangeError(
+      `Invalid character number ${characterNumber}: expected an integer ` +
+        'between 0 and the number of characters in the text.',
+    );
+  }
+  return index;
+}
diff --git a/packages/dom/test/text-position/match-cases.ts b/packages/dom/test/text-position/match-cases.ts
new file mode 100644
index 0000000..0916446
--- /dev/null
+++ b/packages/dom/test/text-position/match-cases.ts
@@ -0,0 +1,142 @@
+/**
+ * @license
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import type { TextPositionSelector } from '@annotator/selector';
+import type { RangeInfo } from '../utils';
+
+// Test cases for the text-position matcher. Each case gives the HTML to parse,
+// the selector to match, and the Range(s) the matcher is expected to yield.
+//
+// Selector positions count characters (code points), whereas the expected
+// offsets are offsets into a text node, counted in UTF-16 code units. The 😃 in
+// each document takes two code units, which is why e.g. position 12
+// corresponds to offset 13 in the 'simple' case.
+export const testCases: {
+  [name: string]: {
+    html: string;
+    selector: TextPositionSelector;
+    expected: RangeInfo[];
+  };
+} = {
+  simple: {
+    html: '<b>l😃rem ipsum dolor amet yada yada</b>',
+    selector: {
+      type: 'TextPositionSelector',
+      start: 12,
+      end: 20,
+    },
+    expected: [
+      {
+        startContainerXPath: '//b/text()',
+        startOffset: 13,
+        endContainerXPath: '//b/text()',
+        endOffset: 21,
+      },
+    ],
+  },
+  'first characters': {
+    html: '<b>l😃rem ipsum dolor amet yada yada</b>',
+    selector: {
+      type: 'TextPositionSelector',
+      start: 0,
+      end: 11,
+    },
+    expected: [
+      {
+        startContainerXPath: '//b/text()',
+        startOffset: 0,
+        endContainerXPath: '//b/text()',
+        endOffset: 12,
+      },
+    ],
+  },
+  'last characters': {
+    html: '<b>l😃rem ipsum dolor amet yada yada</b>',
+    selector: {
+      type: 'TextPositionSelector',
+      start: 23,
+      end: 32,
+    },
+    expected: [
+      {
+        startContainerXPath: '//b/text()',
+        startOffset: 24,
+        endContainerXPath: '//b/text()',
+        endOffset: 33,
+      },
+    ],
+  },
+  'across elements': {
+    html: '<b>l😃rem <i>ipsum</i> dolor <u>amet</u> yada yada</b>',
+    selector: {
+      type: 'TextPositionSelector',
+      start: 12,
+      end: 20,
+    },
+    expected: [
+      {
+        startContainerXPath: '//b/text()[2]',
+        startOffset: 1,
+        endContainerXPath: '//u/text()',
+        endOffset: 2,
+      },
+    ],
+  },
+  'exact element contents': {
+    html: '<b>l😃rem <i>ipsum dolor</i> amet yada yada</b>',
+    selector: {
+      type: 'TextPositionSelector',
+      start: 6,
+      end: 17,
+    },
+    expected: [
+      {
+        startContainerXPath: '//i/text()',
+        startOffset: 0,
+        endContainerXPath: '//b/text()[2]',
+        endOffset: 0,
+      },
+    ],
+  },
+  'text inside <head>': {
+    html:
+      '<head><title>l😃rem ipsum dolor amet</title></head><b>yada yada</b>',
+    selector: {
+      type: 'TextPositionSelector',
+      start: 18,
+      end: 22,
+    },
+    expected: [
+      {
+        startContainerXPath: '//title/text()',
+        startOffset: 19,
+        endContainerXPath: '//b/text()[1]',
+        endOffset: 0,
+      },
+    ],
+  },
+  // Here start equals end, so the expected match is a collapsed (empty) range.
+  'empty quote': {
+    html: '<b>l😃rem</b>',
+    selector: {
+      type: 'TextPositionSelector',
+      start: 3,
+      end: 3,
+    },
+    expected: [{
+      startContainerXPath: '//b/text()',
+      startOffset: 4,
+      endContainerXPath: '//b/text()',
+      endOffset: 4,
+    }],
+  },
+};
diff --git a/packages/dom/test/text-position/match.test.ts b/packages/dom/test/text-position/match.test.ts
new file mode 100644
index 0000000..1acaed0
--- /dev/null
+++ b/packages/dom/test/text-position/match.test.ts
@@ -0,0 +1,227 @@
+/**
+ * @license
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import { assert } from 'chai';
+import type { TextPositionSelector } from '@annotator/selector';
+import { createTextPositionSelectorMatcher } from '../../src/text-position/match';
+import { evaluateXPath } from '../utils';
+import type { RangeInfo } from '../utils';
+import { testCases } from './match-cases';
+
+// Shared parser used to turn each test's HTML string into a Document.
+const domParser = new window.DOMParser();
+
+// Tests for the text-position matcher. Selector positions count characters
+// (code points), while the expected Range offsets count UTF-16 code units.
+describe('createTextPositionSelectorMatcher', () => {
+  // Run every shared test case with the whole document as scope.
+  for (const [name, { html, selector, expected }] of Object.entries(
+    testCases,
+  )) {
+    it(`works for case: '${name}'`, async () => {
+      const doc = domParser.parseFromString(html, 'text/html');
+
+      const scope = doc.createRange();
+      scope.selectNodeContents(doc);
+
+      await testMatcher(doc, scope, selector, expected);
+    });
+  }
+
+  it('handles adjacent text nodes', async () => {
+    const { html, selector } = testCases['simple'];
+    const doc = domParser.parseFromString(html, 'text/html');
+
+    const scope = doc.createRange();
+    scope.selectNodeContents(doc);
+
+    const textNode = evaluateXPath(doc, '//b/text()') as Text;
+
+    // Split the single text node in two, so that the match spans both parts.
+    textNode.splitText(16);
+    // console.log([...textNode.parentNode.childNodes].map(node => node.textContent))
+    // → [ 'l😃rem ipsum dol', 'or amet yada yada' ]
+
+    await testMatcher(doc, scope, selector, [
+      {
+        startContainerXPath: '//b/text()[1]',
+        startOffset: 13,
+        endContainerXPath: '//b/text()[2]',
+        endOffset: 5,
+      },
+    ]);
+  });
+
+  it('handles empty text nodes', async () => {
+    const { html, selector } = testCases['simple'];
+    const doc = domParser.parseFromString(html, 'text/html');
+
+    const scope = doc.createRange();
+    scope.selectNodeContents(doc);
+
+    // Splitting repeatedly at the same offset (and at both ends of the text)
+    // leaves empty text nodes between the non-empty parts.
+    const textNode = evaluateXPath(doc, '//b/text()') as Text;
+    textNode.splitText(textNode.length);
+    textNode.splitText(21);
+    textNode.splitText(21);
+    textNode.splitText(18);
+    textNode.splitText(18);
+    textNode.splitText(13);
+    textNode.splitText(13);
+    textNode.splitText(0);
+    // console.log([...textNode.parentNode.childNodes].map(node => node.textContent))
+    // → [ '', 'l😃rem ipsum ', '', 'dolor', '', ' am', '', 'et yada yada', '' ]
+
+
+    await testMatcher(doc, scope, selector, [
+      {
+        startContainerXPath: '//b/text()[4]', // "dolor"
+        startOffset: 0,
+        endContainerXPath: '//b/text()[8]', // "et yada yada"
+        endOffset: 0,
+      },
+    ]);
+  });
+
+  it('works when scope spans one text node’s contents, matching its first characters', async () => {
+    const { html, selector, expected } = testCases['first characters'];
+    const doc = domParser.parseFromString(html, 'text/html');
+
+    // The scope's boundary points are inside the text node itself here.
+    const scope = doc.createRange();
+    scope.selectNodeContents(evaluateXPath(doc, '//b/text()'));
+
+    await testMatcher(doc, scope, selector, expected);
+  });
+
+  it('works when scope starts with an empty text node, matching its first characters', async () => {
+    const { html, selector } = testCases['first characters'];
+    const doc = domParser.parseFromString(html, 'text/html');
+
+    // Splitting at offset 0 leaves an empty text node before the actual text.
+    const textNode = evaluateXPath(doc, '//b/text()') as Text;
+    textNode.splitText(0);
+
+    const scope = doc.createRange();
+    scope.selectNodeContents(evaluateXPath(doc, '//b'));
+
+    await testMatcher(doc, scope, selector, [
+      {
+        startContainerXPath: '//b/text()[2]',
+        startOffset: 0,
+        endContainerXPath: '//b/text()[2]',
+        endOffset: 12,
+      },
+    ]);
+  });
+
+  it('works when scope has both ends within one text node', async () => {
+    const { html, expected } = testCases['simple'];
+
+    const doc = domParser.parseFromString(html, 'text/html');
+
+    // Use the substring ‘ipsum dolor amet’ as scope.
+    const scope = doc.createRange();
+    scope.setStart(evaluateXPath(doc, '//b/text()'), 7);
+    scope.setEnd(evaluateXPath(doc, '//b/text()'), 23);
+
+    // Positions are relative to the scope's text, not to the whole document.
+    const selector: TextPositionSelector = {
+      type: 'TextPositionSelector',
+      start: 6,
+      end: 14,
+    };
+
+    await testMatcher(doc, scope, selector, expected);
+  });
+
+  it('works when scope has both ends inside text nodes', async () => {
+    const { html, expected } = testCases['across elements'];
+    const doc = domParser.parseFromString(html, 'text/html');
+
+    // Use the substring ‘sum dolor am’ as scope.
+    const scope = doc.createRange();
+    scope.setStart(evaluateXPath(doc, '//i/text()'), 2);
+    scope.setEnd(evaluateXPath(doc, '//u/text()'), 2);
+
+    const selector: TextPositionSelector = {
+      type: 'TextPositionSelector',
+      start: 4,
+      end: 12,
+    };
+
+    await testMatcher(doc, scope, selector, expected);
+  });
+
+  it('works when scope has both ends inside an element', async () => {
+    const { html, expected } = testCases['across elements'];
+    const doc = domParser.parseFromString(html, 'text/html');
+
+    // The scope's boundary points are child indices of <b>, not text offsets.
+    const scope = doc.createRange();
+    scope.setStart(evaluateXPath(doc, '//b'), 1); // before the <i>
+    scope.setEnd(evaluateXPath(doc, '//b'), 4); // before the " yada yada"
+    const selector: TextPositionSelector = {
+      type: 'TextPositionSelector',
+      start: 6,
+      end: 14,
+    };
+    await testMatcher(doc, scope, selector, expected);
+  });
+});
+
+/**
+ * Runs a text-position matcher for `selector` on `scope`, and asserts that the
+ * ranges it yields correspond, in order, to the `expected` ones.
+ *
+ * @param doc - The document in which the expected XPaths are evaluated.
+ * @param scope - The Range passed to the matcher as its scope.
+ * @param selector - The selector to create the matcher from.
+ * @param expected - The expected boundary containers (as XPaths) and offsets.
+ */
+async function testMatcher(
+  doc: Document,
+  scope: Range,
+  selector: TextPositionSelector,
+  expected: RangeInfo[],
+): Promise<void> {
+  const matcher = createTextPositionSelectorMatcher(selector);
+  const matches: Range[] = [];
+  for await (const value of matcher(scope)) matches.push(value);
+  assert.equal(matches.length, expected.length);
+  matches.forEach((match, i) => {
+    const expectedRange = expected[i];
+    // Resolve each expected container once; the nodes are used both for the
+    // comparison and for the failure message.
+    const expectedStartContainer = evaluateXPath(
+      doc,
+      expectedRange.startContainerXPath,
+    );
+    const expectedEndContainer = evaluateXPath(
+      doc,
+      expectedRange.endContainerXPath,
+    );
+    assert(
+      match.startContainer === expectedStartContainer,
+      `unexpected start container: ${prettyNodeName(match.startContainer)}; ` +
+        `expected ${prettyNodeName(expectedStartContainer)}`,
+    );
+    assert.equal(match.startOffset, expectedRange.startOffset);
+    assert(
+      match.endContainer === expectedEndContainer,
+      `unexpected end container: ${prettyNodeName(match.endContainer)}; ` +
+        `expected ${prettyNodeName(expectedEndContainer)}`,
+    );
+    assert.equal(match.endOffset, expectedRange.endOffset);
+  });
+}
+
+/**
+ * Gives a short, human-readable label for a node, for use in assertion
+ * messages: text nodes show (at most the first 50 characters of) their text,
+ * elements their lowercased tag name, other nodes their lowercased node name.
+ */
+function prettyNodeName(node: Node) {
+  const maxTextLength = 50;
+  if (node.nodeType === Node.TEXT_NODE) {
+    const value = (node as Text).nodeValue || '';
+    const shown =
+      value.length > maxTextLength
+        ? value.substring(0, maxTextLength) + '…'
+        : value;
+    return `#text "${shown}"`;
+  }
+  if (node.nodeType === Node.ELEMENT_NODE) {
+    const { tagName } = node as Element;
+    return `<${tagName.toLowerCase()}>`;
+  }
+  return node.nodeName.toLowerCase();
+}
diff --git a/packages/selector/src/index.ts b/packages/selector/src/index.ts
index c66bd94..ffab70b 100644
--- a/packages/selector/src/index.ts
+++ b/packages/selector/src/index.ts
@@ -21,7 +21,7 @@
 import type { Matcher, Selector } from './types';
 
 export type { Matcher, Selector } from './types';
-export type { CssSelector, RangeSelector, TextQuoteSelector } from './types';
+export type { CssSelector, RangeSelector, TextPositionSelector, TextQuoteSelector } from './types';
 
 export function makeRefinable<
   // Any subtype of Selector can be made refinable; but note we limit the value
diff --git a/packages/selector/src/types.ts b/packages/selector/src/types.ts
index fc4f64b..e57fed0 100644
--- a/packages/selector/src/types.ts
+++ b/packages/selector/src/types.ts
@@ -34,6 +34,12 @@ export interface TextQuoteSelector extends Selector {
   suffix?: string;
 }
 
+/**
+ * A selector that describes a stretch of text by the positions of its start
+ * and its end.
+ */
+export interface TextPositionSelector extends Selector {
+  type: 'TextPositionSelector';
+  start: number; // more precisely: non-negative integer
+  end: number; // more precisely: non-negative integer
+}
+
 export interface RangeSelector extends Selector {
   type: 'RangeSelector';
   startSelector: Selector;