You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@annotator.apache.org by ge...@apache.org on 2020/11/20 21:21:11 UTC
[incubator-annotator] 13/14: Move abstract code into
@annotator/selector
This is an automated email from the ASF dual-hosted git repository.
gerben pushed a commit to branch import-dom-seek
in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git
commit 6ee4ff8992c68366d8c0d3120945a94a45298d88
Author: Gerben <ge...@treora.com>
AuthorDate: Fri Nov 20 16:40:19 2020 +0100
Move abstract code into @annotator/selector
---
packages/dom/src/chunker.ts | 51 +------
packages/dom/src/text-position/describe.ts | 22 +--
packages/dom/src/text-position/match.ts | 28 +---
packages/dom/src/text-quote/describe.ts | 115 +-------------
packages/dom/src/text-quote/match.ts | 148 +-----------------
packages/selector/src/index.ts | 1 +
packages/selector/src/text/chunker.ts | 69 +++++++++
.../src => selector/src/text}/code-point-seeker.ts | 0
.../src/text/describe-text-position.ts} | 34 +----
packages/selector/src/text/describe-text-quote.ts | 134 ++++++++++++++++
packages/selector/src/text/index.ts | 5 +
.../src/text/match-text-position.ts} | 27 +---
packages/selector/src/text/match-text-quote.ts | 168 +++++++++++++++++++++
packages/{dom/src => selector/src/text}/seek.ts | 0
14 files changed, 392 insertions(+), 410 deletions(-)
diff --git a/packages/dom/src/chunker.ts b/packages/dom/src/chunker.ts
index 8c56924..bc5c2c1 100644
--- a/packages/dom/src/chunker.ts
+++ b/packages/dom/src/chunker.ts
@@ -18,59 +18,10 @@
* under the License.
*/
+import type { Chunk, Chunker, ChunkRange } from '@annotator/selector';
import { normalizeRange } from './normalize-range';
import { ownerDocument } from './owner-document';
-// A Chunk represents a fragment (typically a string) of some document.
-// Subclasses can add further attributes to map the chunk to its position in the
-// data structure it came from (e.g. a DOM node).
-export interface Chunk<TData> {
- readonly data: TData;
- equals?(otherChunk: this): boolean;
-}
-
-export interface ChunkRange<TChunk extends Chunk<any>> {
- startChunk: TChunk;
- startIndex: number;
- endChunk: TChunk;
- endIndex: number;
-}
-
-export function chunkEquals(chunk1: Chunk<any>, chunk2: Chunk<any>): boolean {
- return chunk1.equals ? chunk1.equals(chunk2) : chunk1 === chunk2;
-}
-
-export function chunkRangeEquals(
- range1: ChunkRange<any>,
- range2: ChunkRange<any>,
-): boolean {
- return (
- chunkEquals(range1.startChunk, range2.startChunk) &&
- chunkEquals(range1.endChunk, range2.endChunk) &&
- range1.startIndex === range2.startIndex &&
- range1.endIndex === range2.endIndex
- );
-}
-
-// A Chunker lets one walk through the chunks of a document.
-// It is inspired by, and similar to, the DOM’s NodeIterator. (but unlike
-// NodeIterator, it has no concept of being ‘before’ or ‘after’ a chunk)
-export interface Chunker<TChunk extends Chunk<any>> {
- // The chunk currently being pointed at.
- readonly currentChunk: TChunk;
-
- // Move currentChunk to the chunk following it, and return that chunk.
- // If there are no chunks following it, keep currentChunk unchanged and return null.
- nextChunk(): TChunk | null;
-
- // Move currentChunk to the chunk preceding it, and return that chunk.
- // If there are no preceding chunks, keep currentChunk unchanged and return null.
- previousChunk(): TChunk | null;
-
- // Test if a given chunk is before the current chunk.
- precedesCurrentChunk(chunk: TChunk): boolean;
-}
-
export interface PartialTextNode extends Chunk<string> {
readonly node: Text;
readonly startOffset: number;
diff --git a/packages/dom/src/text-position/describe.ts b/packages/dom/src/text-position/describe.ts
index 5f7f9a3..992a633 100644
--- a/packages/dom/src/text-position/describe.ts
+++ b/packages/dom/src/text-position/describe.ts
@@ -19,11 +19,9 @@
*/
import type { TextPositionSelector } from '@annotator/selector';
-import type { Chunk, Chunker, ChunkRange } from '../chunker';
+import { describeTextPosition as abstractDescribeTextPosition } from '@annotator/selector';
import { TextNodeChunker } from '../chunker';
-import { CodePointSeeker } from '../code-point-seeker';
import { ownerDocument } from '../owner-document';
-import { TextSeeker } from '../seek';
export async function describeTextPosition(
range: Range,
@@ -48,21 +46,3 @@ export async function describeTextPosition(
textChunks,
);
}
-
-async function abstractDescribeTextPosition<TChunk extends Chunk<string>>(
- target: ChunkRange<TChunk>,
- scope: Chunker<TChunk>,
-): Promise<TextPositionSelector> {
- const codeUnitSeeker = new TextSeeker(scope);
- const codePointSeeker = new CodePointSeeker(codeUnitSeeker);
-
- codePointSeeker.seekToChunk(target.startChunk, target.startIndex);
- const start = codePointSeeker.position;
- codePointSeeker.seekToChunk(target.endChunk, target.endIndex);
- const end = codePointSeeker.position;
- return {
- type: 'TextPositionSelector',
- start,
- end,
- };
-}
diff --git a/packages/dom/src/text-position/match.ts b/packages/dom/src/text-position/match.ts
index becd957..3dccede 100644
--- a/packages/dom/src/text-position/match.ts
+++ b/packages/dom/src/text-position/match.ts
@@ -19,10 +19,8 @@
*/
import type { Matcher, TextPositionSelector } from '@annotator/selector';
-import type { Chunk, ChunkRange, Chunker } from '../chunker';
+import { textPositionSelectorMatcher as abstractTextPositionSelectorMatcher } from '@annotator/selector';
import { TextNodeChunker } from '../chunker';
-import { CodePointSeeker } from '../code-point-seeker';
-import { TextSeeker } from '../seek';
export function createTextPositionSelectorMatcher(
selector: TextPositionSelector,
@@ -39,27 +37,3 @@ export function createTextPositionSelectorMatcher(
}
};
}
-
-export function abstractTextPositionSelectorMatcher(
- selector: TextPositionSelector,
-): <TChunk extends Chunk<any>>(
- scope: Chunker<TChunk>,
-) => AsyncGenerator<ChunkRange<TChunk>, void, void> {
- const { start, end } = selector;
-
- return async function* matchAll<TChunk extends Chunk<string>>(
- textChunks: Chunker<TChunk>,
- ) {
- const codeUnitSeeker = new TextSeeker(textChunks);
- const codePointSeeker = new CodePointSeeker(codeUnitSeeker);
-
- codePointSeeker.seekTo(start);
- const startChunk = codeUnitSeeker.currentChunk;
- const startIndex = codeUnitSeeker.offsetInChunk;
- codePointSeeker.seekTo(end);
- const endChunk = codeUnitSeeker.currentChunk;
- const endIndex = codeUnitSeeker.offsetInChunk;
-
- yield { startChunk, startIndex, endChunk, endIndex };
- };
-}
diff --git a/packages/dom/src/text-quote/describe.ts b/packages/dom/src/text-quote/describe.ts
index 3dfa45e..3ca3f0b 100644
--- a/packages/dom/src/text-quote/describe.ts
+++ b/packages/dom/src/text-quote/describe.ts
@@ -19,12 +19,9 @@
*/
import type { TextQuoteSelector } from '@annotator/selector';
-import type { Chunk, Chunker, ChunkRange } from '../chunker';
-import { TextNodeChunker, chunkRangeEquals } from '../chunker';
+import { describeTextQuote as abstractDescribeTextQuote } from '@annotator/selector';
+import { TextNodeChunker } from '../chunker';
import { ownerDocument } from '../owner-document';
-import type { Seeker } from '../seek';
-import { TextSeeker } from '../seek';
-import { abstractTextQuoteSelectorMatcher } from '.';
export async function describeTextQuote(
range: Range,
@@ -47,111 +44,3 @@ export async function describeTextQuote(
() => new TextNodeChunker(scope),
);
}
-
-async function abstractDescribeTextQuote<TChunk extends Chunk<string>>(
- target: ChunkRange<TChunk>,
- scope: () => Chunker<TChunk>,
-): Promise<TextQuoteSelector> {
- const seeker = new TextSeeker(scope());
-
- // Read the target’s exact text.
- seeker.seekToChunk(target.startChunk, target.startIndex);
- const exact = seeker.readToChunk(target.endChunk, target.endIndex);
-
- // Starting with an empty prefix and suffix, we search for matches. At each unintended match
- // we encounter, we extend the prefix or suffix just enough to ensure it will no longer match.
- let prefix = '';
- let suffix = '';
-
- while (true) {
- const tentativeSelector: TextQuoteSelector = {
- type: 'TextQuoteSelector',
- exact,
- prefix,
- suffix,
- };
-
- const matches = abstractTextQuoteSelectorMatcher(tentativeSelector)(
- scope(),
- );
- let nextMatch = await matches.next();
-
- // If this match is the intended one, no need to act.
- // XXX This test is fragile: nextMatch and target are assumed to be normalised.
- if (!nextMatch.done && chunkRangeEquals(nextMatch.value, target)) {
- nextMatch = await matches.next();
- }
-
- // If there are no more unintended matches, our selector is unambiguous!
- if (nextMatch.done) return tentativeSelector;
-
- // Possible optimisation: A subsequent search could safely skip the part we
- // already processed, instead of starting from the beginning again. But we’d
- // need the matcher to start at the seeker’s position, instead of searching
- // in the whole current chunk. Then we could just seek back to just after
- // the start of the prefix: seeker.seekBy(-prefix.length + 1); (don’t forget
- // to also correct for any changes in the prefix we will make below)
-
- // We’ll have to add more prefix/suffix to disqualify this unintended match.
- const unintendedMatch = nextMatch.value;
- const seeker1 = new TextSeeker(scope());
- const seeker2 = new TextSeeker(scope());
-
- // Count how many characters we’d need as a prefix to disqualify this match.
- seeker1.seekToChunk(target.startChunk, target.startIndex - prefix.length);
- seeker2.seekToChunk(
- unintendedMatch.startChunk,
- unintendedMatch.startIndex - prefix.length,
- );
- const extraPrefix = readUntilDifferent(seeker1, seeker2, true);
-
- // Count how many characters we’d need as a suffix to disqualify this match.
- seeker1.seekToChunk(target.endChunk, target.endIndex + suffix.length);
- seeker2.seekToChunk(
- unintendedMatch.endChunk,
- unintendedMatch.endIndex + suffix.length,
- );
- const extraSuffix = readUntilDifferent(seeker1, seeker2, false);
-
- // Use either the prefix or suffix, whichever is shortest.
- if (
- extraPrefix !== undefined &&
- (extraSuffix === undefined || extraPrefix.length <= extraSuffix.length)
- ) {
- prefix = extraPrefix + prefix;
- } else if (extraSuffix !== undefined) {
- suffix = suffix + extraSuffix;
- } else {
- throw new Error(
- 'Target cannot be disambiguated; how could that have happened‽',
- );
- }
- }
-}
-
-function readUntilDifferent(
- seeker1: Seeker,
- seeker2: Seeker,
- reverse: boolean,
-): string | undefined {
- let result = '';
- while (true) {
- let nextCharacter: string;
- try {
- nextCharacter = seeker1.read(reverse ? -1 : 1);
- } catch (err) {
- return undefined; // Start/end of text reached: cannot expand result.
- }
- result = reverse ? nextCharacter + result : result + nextCharacter;
-
- // Check if the newly added character makes the result differ from the second seeker.
- let comparisonCharacter: string | undefined;
- try {
- comparisonCharacter = seeker2.read(reverse ? -1 : 1);
- } catch (err) {
- // A RangeError would merely mean seeker2 is exhausted.
- if (!(err instanceof RangeError)) throw err;
- }
- if (nextCharacter !== comparisonCharacter) return result;
- }
-}
diff --git a/packages/dom/src/text-quote/match.ts b/packages/dom/src/text-quote/match.ts
index 9e09990..0c16b8a 100644
--- a/packages/dom/src/text-quote/match.ts
+++ b/packages/dom/src/text-quote/match.ts
@@ -19,7 +19,7 @@
*/
import type { Matcher, TextQuoteSelector } from '@annotator/selector';
-import type { Chunk, Chunker, ChunkRange } from '../chunker';
+import { textQuoteSelectorMatcher as abstractTextQuoteSelectorMatcher } from '@annotator/selector';
import { TextNodeChunker, EmptyScopeError } from '../chunker';
export function createTextQuoteSelectorMatcher(
@@ -42,149 +42,3 @@ export function createTextQuoteSelectorMatcher(
}
};
}
-
-export function abstractTextQuoteSelectorMatcher(
- selector: TextQuoteSelector,
-): <TChunk extends Chunk<any>>(
- scope: Chunker<TChunk>,
-) => AsyncGenerator<ChunkRange<TChunk>, void, void> {
- return async function* matchAll<TChunk extends Chunk<string>>(
- textChunks: Chunker<TChunk>,
- ) {
- const exact = selector.exact;
- const prefix = selector.prefix || '';
- const suffix = selector.suffix || '';
- const searchPattern = prefix + exact + suffix;
-
- // The code below runs a loop with three steps:
- // 1. Continue checking any partial matches from the previous chunk(s).
- // 2. Try find the whole pattern in the chunk (possibly multiple times).
- // 3. Check if this chunk ends with a partial match (or even multiple partial matches).
-
- interface PartialMatch {
- startChunk?: TChunk;
- startIndex?: number;
- endChunk?: TChunk;
- endIndex?: number;
- charactersMatched: number;
- }
- let partialMatches: PartialMatch[] = [];
-
- let isFirstChunk = true;
- do {
- const chunk = textChunks.currentChunk;
- const chunkValue = chunk.data;
-
- // 1. Continue checking any partial matches from the previous chunk(s).
- const remainingPartialMatches: typeof partialMatches = [];
- for (const partialMatch of partialMatches) {
- const charactersMatched = partialMatch.charactersMatched;
-
- // If the current chunk contains the start and/or end of the match, record these.
- if (partialMatch.endChunk === undefined) {
- const charactersUntilMatchEnd =
- prefix.length + exact.length - charactersMatched;
- if (charactersUntilMatchEnd <= chunkValue.length) {
- partialMatch.endChunk = chunk;
- partialMatch.endIndex = charactersUntilMatchEnd;
- }
- }
- if (partialMatch.startChunk === undefined) {
- const charactersUntilMatchStart = prefix.length - charactersMatched;
- if (
- charactersUntilMatchStart < chunkValue.length ||
- partialMatch.endChunk !== undefined // handles an edge case: an empty quote at the end of a chunk.
- ) {
- partialMatch.startChunk = chunk;
- partialMatch.startIndex = charactersUntilMatchStart;
- }
- }
-
- const charactersUntilSuffixEnd =
- searchPattern.length - charactersMatched;
- if (charactersUntilSuffixEnd <= chunkValue.length) {
- if (
- chunkValue.startsWith(searchPattern.substring(charactersMatched))
- ) {
- yield partialMatch as ChunkRange<TChunk>; // all fields are certainly defined now.
- }
- } else if (
- chunkValue ===
- searchPattern.substring(
- charactersMatched,
- charactersMatched + chunkValue.length,
- )
- ) {
- // The chunk is too short to complete the match; comparison has to be completed in subsequent chunks.
- partialMatch.charactersMatched += chunkValue.length;
- remainingPartialMatches.push(partialMatch);
- }
- }
- partialMatches = remainingPartialMatches;
-
- // 2. Try find the whole pattern in the chunk (possibly multiple times).
- if (searchPattern.length <= chunkValue.length) {
- let fromIndex = 0;
- while (fromIndex <= chunkValue.length) {
- const patternStartIndex = chunkValue.indexOf(
- searchPattern,
- fromIndex,
- );
- if (patternStartIndex === -1) break;
- fromIndex = patternStartIndex + 1;
-
- // Handle edge case: an empty searchPattern would already have been yielded at the end of the last chunk.
- if (
- patternStartIndex === 0 &&
- searchPattern.length === 0 &&
- !isFirstChunk
- )
- continue;
-
- yield {
- startChunk: chunk,
- startIndex: patternStartIndex + prefix.length,
- endChunk: chunk,
- endIndex: patternStartIndex + prefix.length + exact.length,
- };
- }
- }
-
- // 3. Check if this chunk ends with a partial match (or even multiple partial matches).
- let newPartialMatches: number[] = [];
- const searchStartPoint = Math.max(
- chunkValue.length - searchPattern.length + 1,
- 0,
- );
- for (let i = searchStartPoint; i < chunkValue.length; i++) {
- const character = chunkValue[i];
- newPartialMatches = newPartialMatches.filter(
- (partialMatchStartIndex) =>
- character === searchPattern[i - partialMatchStartIndex],
- );
- if (character === searchPattern[0]) newPartialMatches.push(i);
- }
- for (const partialMatchStartIndex of newPartialMatches) {
- const charactersMatched = chunkValue.length - partialMatchStartIndex;
- const partialMatch: PartialMatch = {
- charactersMatched,
- };
- if (charactersMatched >= prefix.length + exact.length) {
- partialMatch.endChunk = chunk;
- partialMatch.endIndex =
- partialMatchStartIndex + prefix.length + exact.length;
- }
- if (
- charactersMatched > prefix.length ||
- partialMatch.endChunk !== undefined // handles an edge case: an empty quote at the end of a chunk.
- ) {
- partialMatch.startChunk = chunk;
- partialMatch.startIndex = partialMatchStartIndex + prefix.length;
- }
- partialMatches.push(partialMatch);
- }
-
- isFirstChunk = false;
- } while (textChunks.nextChunk() !== null);
- };
-}
diff --git a/packages/selector/src/index.ts b/packages/selector/src/index.ts
index 73caa05..adefd04 100644
--- a/packages/selector/src/index.ts
+++ b/packages/selector/src/index.ts
@@ -27,6 +27,7 @@ export type {
TextPositionSelector,
TextQuoteSelector,
} from './types';
+export * from './text';
export function makeRefinable<
// Any subtype of Selector can be made refinable; but note we limit the value
diff --git a/packages/selector/src/text/chunker.ts b/packages/selector/src/text/chunker.ts
new file mode 100644
index 0000000..eb0d970
--- /dev/null
+++ b/packages/selector/src/text/chunker.ts
@@ -0,0 +1,69 @@
+/**
+ * @license
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+// A Chunk represents a fragment (typically a string) of some document.
+// Subclasses can add further attributes to map the chunk to its position in the
+// data structure it came from (e.g. a DOM node).
+export interface Chunk<TData> {
+ readonly data: TData;
+ equals?(otherChunk: this): boolean;
+}
+
+export function chunkEquals(chunk1: Chunk<any>, chunk2: Chunk<any>): boolean {
+ return chunk1.equals ? chunk1.equals(chunk2) : chunk1 === chunk2;
+}
+
+export interface ChunkRange<TChunk extends Chunk<any>> {
+ startChunk: TChunk;
+ startIndex: number;
+ endChunk: TChunk;
+ endIndex: number;
+}
+
+export function chunkRangeEquals(
+ range1: ChunkRange<any>,
+ range2: ChunkRange<any>,
+): boolean {
+ return (
+ chunkEquals(range1.startChunk, range2.startChunk) &&
+ chunkEquals(range1.endChunk, range2.endChunk) &&
+ range1.startIndex === range2.startIndex &&
+ range1.endIndex === range2.endIndex
+ );
+}
+
+// A Chunker lets one walk through the chunks of a document.
+// It is inspired by, and similar to, the DOM’s NodeIterator. (but unlike
+// NodeIterator, it has no concept of being ‘before’ or ‘after’ a chunk)
+export interface Chunker<TChunk extends Chunk<any>> {
+ // The chunk currently being pointed at.
+ readonly currentChunk: TChunk;
+
+ // Move currentChunk to the chunk following it, and return that chunk.
+ // If there are no chunks following it, keep currentChunk unchanged and return null.
+ nextChunk(): TChunk | null;
+
+ // Move currentChunk to the chunk preceding it, and return that chunk.
+ // If there are no preceding chunks, keep currentChunk unchanged and return null.
+ previousChunk(): TChunk | null;
+
+ // Test if a given chunk is before the current chunk.
+ precedesCurrentChunk(chunk: TChunk): boolean;
+}
diff --git a/packages/dom/src/code-point-seeker.ts b/packages/selector/src/text/code-point-seeker.ts
similarity index 100%
rename from packages/dom/src/code-point-seeker.ts
rename to packages/selector/src/text/code-point-seeker.ts
diff --git a/packages/dom/src/text-position/describe.ts b/packages/selector/src/text/describe-text-position.ts
similarity index 58%
copy from packages/dom/src/text-position/describe.ts
copy to packages/selector/src/text/describe-text-position.ts
index 5f7f9a3..99485b0 100644
--- a/packages/dom/src/text-position/describe.ts
+++ b/packages/selector/src/text/describe-text-position.ts
@@ -19,37 +19,11 @@
*/
import type { TextPositionSelector } from '@annotator/selector';
-import type { Chunk, Chunker, ChunkRange } from '../chunker';
-import { TextNodeChunker } from '../chunker';
-import { CodePointSeeker } from '../code-point-seeker';
-import { ownerDocument } from '../owner-document';
-import { TextSeeker } from '../seek';
+import type { Chunk, Chunker, ChunkRange } from './chunker';
+import { CodePointSeeker } from './code-point-seeker';
+import { TextSeeker } from './seek';
-export async function describeTextPosition(
- range: Range,
- maybeScope?: Range,
-): Promise<TextPositionSelector> {
- // Default to search in the whole document.
- let scope: Range;
- if (maybeScope !== undefined) {
- scope = maybeScope;
- } else {
- const document = ownerDocument(range);
- scope = document.createRange();
- scope.selectNodeContents(document);
- }
-
- const textChunks = new TextNodeChunker(scope);
- if (textChunks.currentChunk === null)
- throw new RangeError('Range does not contain any Text nodes.');
-
- return await abstractDescribeTextPosition(
- textChunks.rangeToChunkRange(range),
- textChunks,
- );
-}
-
-async function abstractDescribeTextPosition<TChunk extends Chunk<string>>(
+export async function describeTextPosition<TChunk extends Chunk<string>>(
target: ChunkRange<TChunk>,
scope: Chunker<TChunk>,
): Promise<TextPositionSelector> {
diff --git a/packages/selector/src/text/describe-text-quote.ts b/packages/selector/src/text/describe-text-quote.ts
new file mode 100644
index 0000000..c97a4f8
--- /dev/null
+++ b/packages/selector/src/text/describe-text-quote.ts
@@ -0,0 +1,134 @@
+/**
+ * @license
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import type { TextQuoteSelector } from '@annotator/selector';
+import type { Chunk, Chunker, ChunkRange } from './chunker';
+import { chunkRangeEquals } from './chunker';
+import type { Seeker } from './seek';
+import { TextSeeker } from './seek';
+import { textQuoteSelectorMatcher } from '.';
+
+export async function describeTextQuote<TChunk extends Chunk<string>>(
+ target: ChunkRange<TChunk>,
+ scope: () => Chunker<TChunk>,
+ ): Promise<TextQuoteSelector> {
+ const seeker = new TextSeeker(scope());
+
+ // Read the target’s exact text.
+ seeker.seekToChunk(target.startChunk, target.startIndex);
+ const exact = seeker.readToChunk(target.endChunk, target.endIndex);
+
+ // Starting with an empty prefix and suffix, we search for matches. At each unintended match
+ // we encounter, we extend the prefix or suffix just enough to ensure it will no longer match.
+ let prefix = '';
+ let suffix = '';
+
+ while (true) {
+ const tentativeSelector: TextQuoteSelector = {
+ type: 'TextQuoteSelector',
+ exact,
+ prefix,
+ suffix,
+ };
+
+ const matches = textQuoteSelectorMatcher(tentativeSelector)(
+ scope(),
+ );
+ let nextMatch = await matches.next();
+
+ // If this match is the intended one, no need to act.
+ // XXX This test is fragile: nextMatch and target are assumed to be normalised.
+ if (!nextMatch.done && chunkRangeEquals(nextMatch.value, target)) {
+ nextMatch = await matches.next();
+ }
+
+ // If there are no more unintended matches, our selector is unambiguous!
+ if (nextMatch.done) return tentativeSelector;
+
+ // Possible optimisation: A subsequent search could safely skip the part we
+ // already processed, instead of starting from the beginning again. But we’d
+ // need the matcher to start at the seeker’s position, instead of searching
+ // in the whole current chunk. Then we could just seek back to just after
+ // the start of the prefix: seeker.seekBy(-prefix.length + 1); (don’t forget
+ // to also correct for any changes in the prefix we will make below)
+
+ // We’ll have to add more prefix/suffix to disqualify this unintended match.
+ const unintendedMatch = nextMatch.value;
+ const seeker1 = new TextSeeker(scope());
+ const seeker2 = new TextSeeker(scope());
+
+ // Count how many characters we’d need as a prefix to disqualify this match.
+ seeker1.seekToChunk(target.startChunk, target.startIndex - prefix.length);
+ seeker2.seekToChunk(
+ unintendedMatch.startChunk,
+ unintendedMatch.startIndex - prefix.length,
+ );
+ const extraPrefix = readUntilDifferent(seeker1, seeker2, true);
+
+ // Count how many characters we’d need as a suffix to disqualify this match.
+ seeker1.seekToChunk(target.endChunk, target.endIndex + suffix.length);
+ seeker2.seekToChunk(
+ unintendedMatch.endChunk,
+ unintendedMatch.endIndex + suffix.length,
+ );
+ const extraSuffix = readUntilDifferent(seeker1, seeker2, false);
+
+ // Use either the prefix or suffix, whichever is shortest.
+ if (
+ extraPrefix !== undefined &&
+ (extraSuffix === undefined || extraPrefix.length <= extraSuffix.length)
+ ) {
+ prefix = extraPrefix + prefix;
+ } else if (extraSuffix !== undefined) {
+ suffix = suffix + extraSuffix;
+ } else {
+ throw new Error(
+ 'Target cannot be disambiguated; how could that have happened‽',
+ );
+ }
+ }
+ }
+
+ function readUntilDifferent(
+ seeker1: Seeker,
+ seeker2: Seeker,
+ reverse: boolean,
+ ): string | undefined {
+ let result = '';
+ while (true) {
+ let nextCharacter: string;
+ try {
+ nextCharacter = seeker1.read(reverse ? -1 : 1);
+ } catch (err) {
+ return undefined; // Start/end of text reached: cannot expand result.
+ }
+ result = reverse ? nextCharacter + result : result + nextCharacter;
+
+ // Check if the newly added character makes the result differ from the second seeker.
+ let comparisonCharacter: string | undefined;
+ try {
+ comparisonCharacter = seeker2.read(reverse ? -1 : 1);
+ } catch (err) {
+ // A RangeError would merely mean seeker2 is exhausted.
+ if (!(err instanceof RangeError)) throw err;
+ }
+ if (nextCharacter !== comparisonCharacter) return result;
+ }
+ }
diff --git a/packages/selector/src/text/index.ts b/packages/selector/src/text/index.ts
new file mode 100644
index 0000000..ca88b66
--- /dev/null
+++ b/packages/selector/src/text/index.ts
@@ -0,0 +1,5 @@
+export * from './describe-text-quote';
+export * from './match-text-quote';
+export * from './describe-text-position';
+export * from './match-text-position';
+export * from './chunker';
diff --git a/packages/dom/src/text-position/match.ts b/packages/selector/src/text/match-text-position.ts
similarity index 66%
copy from packages/dom/src/text-position/match.ts
copy to packages/selector/src/text/match-text-position.ts
index becd957..c00f619 100644
--- a/packages/dom/src/text-position/match.ts
+++ b/packages/selector/src/text/match-text-position.ts
@@ -18,29 +18,12 @@
* under the License.
*/
-import type { Matcher, TextPositionSelector } from '@annotator/selector';
-import type { Chunk, ChunkRange, Chunker } from '../chunker';
-import { TextNodeChunker } from '../chunker';
-import { CodePointSeeker } from '../code-point-seeker';
-import { TextSeeker } from '../seek';
+import type { TextPositionSelector } from '@annotator/selector';
+import type { Chunk, ChunkRange, Chunker } from './chunker';
+import { CodePointSeeker } from './code-point-seeker';
+import { TextSeeker } from './seek';
-export function createTextPositionSelectorMatcher(
- selector: TextPositionSelector,
-): Matcher<Range, Range> {
- const abstractMatcher = abstractTextPositionSelectorMatcher(selector);
-
- return async function* matchAll(scope) {
- const textChunks = new TextNodeChunker(scope);
-
- const matches = abstractMatcher(textChunks);
-
- for await (const abstractMatch of matches) {
- yield textChunks.chunkRangeToRange(abstractMatch);
- }
- };
-}
-
-export function abstractTextPositionSelectorMatcher(
+export function textPositionSelectorMatcher(
selector: TextPositionSelector,
): <TChunk extends Chunk<any>>(
scope: Chunker<TChunk>,
diff --git a/packages/selector/src/text/match-text-quote.ts b/packages/selector/src/text/match-text-quote.ts
new file mode 100644
index 0000000..8e90a2f
--- /dev/null
+++ b/packages/selector/src/text/match-text-quote.ts
@@ -0,0 +1,168 @@
+/**
+ * @license
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import type { TextQuoteSelector } from '@annotator/selector';
+import type { Chunk, Chunker, ChunkRange } from './chunker';
+
+export function textQuoteSelectorMatcher(
+ selector: TextQuoteSelector,
+ ): <TChunk extends Chunk<any>>(
+ scope: Chunker<TChunk>,
+ ) => AsyncGenerator<ChunkRange<TChunk>, void, void> {
+ return async function* matchAll<TChunk extends Chunk<string>>(
+ textChunks: Chunker<TChunk>,
+ ) {
+ const exact = selector.exact;
+ const prefix = selector.prefix || '';
+ const suffix = selector.suffix || '';
+ const searchPattern = prefix + exact + suffix;
+
+ // The code below runs a loop with three steps:
+ // 1. Continue checking any partial matches from the previous chunk(s).
+ // 2. Try find the whole pattern in the chunk (possibly multiple times).
+ // 3. Check if this chunk ends with a partial match (or even multiple partial matches).
+
+ interface PartialMatch {
+ startChunk?: TChunk;
+ startIndex?: number;
+ endChunk?: TChunk;
+ endIndex?: number;
+ charactersMatched: number;
+ }
+ let partialMatches: PartialMatch[] = [];
+
+ let isFirstChunk = true;
+ do {
+ const chunk = textChunks.currentChunk;
+ const chunkValue = chunk.data;
+
+ // 1. Continue checking any partial matches from the previous chunk(s).
+ const remainingPartialMatches: typeof partialMatches = [];
+ for (const partialMatch of partialMatches) {
+ const charactersMatched = partialMatch.charactersMatched;
+
+ // If the current chunk contains the start and/or end of the match, record these.
+ if (partialMatch.endChunk === undefined) {
+ const charactersUntilMatchEnd =
+ prefix.length + exact.length - charactersMatched;
+ if (charactersUntilMatchEnd <= chunkValue.length) {
+ partialMatch.endChunk = chunk;
+ partialMatch.endIndex = charactersUntilMatchEnd;
+ }
+ }
+ if (partialMatch.startChunk === undefined) {
+ const charactersUntilMatchStart = prefix.length - charactersMatched;
+ if (
+ charactersUntilMatchStart < chunkValue.length ||
+ partialMatch.endChunk !== undefined // handles an edge case: an empty quote at the end of a chunk.
+ ) {
+ partialMatch.startChunk = chunk;
+ partialMatch.startIndex = charactersUntilMatchStart;
+ }
+ }
+
+ const charactersUntilSuffixEnd =
+ searchPattern.length - charactersMatched;
+ if (charactersUntilSuffixEnd <= chunkValue.length) {
+ if (
+ chunkValue.startsWith(searchPattern.substring(charactersMatched))
+ ) {
+ yield partialMatch as ChunkRange<TChunk>; // all fields are certainly defined now.
+ }
+ } else if (
+ chunkValue ===
+ searchPattern.substring(
+ charactersMatched,
+ charactersMatched + chunkValue.length,
+ )
+ ) {
+ // The chunk is too short to complete the match; comparison has to be completed in subsequent chunks.
+ partialMatch.charactersMatched += chunkValue.length;
+ remainingPartialMatches.push(partialMatch);
+ }
+ }
+ partialMatches = remainingPartialMatches;
+
+ // 2. Try find the whole pattern in the chunk (possibly multiple times).
+ if (searchPattern.length <= chunkValue.length) {
+ let fromIndex = 0;
+ while (fromIndex <= chunkValue.length) {
+ const patternStartIndex = chunkValue.indexOf(
+ searchPattern,
+ fromIndex,
+ );
+ if (patternStartIndex === -1) break;
+ fromIndex = patternStartIndex + 1;
+
+ // Handle edge case: an empty searchPattern would already have been yielded at the end of the last chunk.
+ if (
+ patternStartIndex === 0 &&
+ searchPattern.length === 0 &&
+ !isFirstChunk
+ )
+ continue;
+
+ yield {
+ startChunk: chunk,
+ startIndex: patternStartIndex + prefix.length,
+ endChunk: chunk,
+ endIndex: patternStartIndex + prefix.length + exact.length,
+ };
+ }
+ }
+
+ // 3. Check if this chunk ends with a partial match (or even multiple partial matches).
+ let newPartialMatches: number[] = [];
+ const searchStartPoint = Math.max(
+ chunkValue.length - searchPattern.length + 1,
+ 0,
+ );
+ for (let i = searchStartPoint; i < chunkValue.length; i++) {
+ const character = chunkValue[i];
+ newPartialMatches = newPartialMatches.filter(
+ (partialMatchStartIndex) =>
+ character === searchPattern[i - partialMatchStartIndex],
+ );
+ if (character === searchPattern[0]) newPartialMatches.push(i);
+ }
+ for (const partialMatchStartIndex of newPartialMatches) {
+ const charactersMatched = chunkValue.length - partialMatchStartIndex;
+ const partialMatch: PartialMatch = {
+ charactersMatched,
+ };
+ if (charactersMatched >= prefix.length + exact.length) {
+ partialMatch.endChunk = chunk;
+ partialMatch.endIndex =
+ partialMatchStartIndex + prefix.length + exact.length;
+ }
+ if (
+ charactersMatched > prefix.length ||
+ partialMatch.endChunk !== undefined // handles an edge case: an empty quote at the end of a chunk.
+ ) {
+ partialMatch.startChunk = chunk;
+ partialMatch.startIndex = partialMatchStartIndex + prefix.length;
+ }
+ partialMatches.push(partialMatch);
+ }
+
+ isFirstChunk = false;
+ } while (textChunks.nextChunk() !== null);
+ };
+ }
diff --git a/packages/dom/src/seek.ts b/packages/selector/src/text/seek.ts
similarity index 100%
rename from packages/dom/src/seek.ts
rename to packages/selector/src/text/seek.ts