You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@annotator.apache.org by ge...@apache.org on 2020/11/12 22:08:11 UTC
[incubator-annotator] 03/04: Make describeTextQuote work, mostly.
This is an automated email from the ASF dual-hosted git repository.
gerben pushed a commit to branch import-dom-seek
in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git
commit 657daab660a91ce7057136e5417326b7ae42f239
Author: Gerben <ge...@treora.com>
AuthorDate: Thu Nov 12 12:14:24 2020 +0100
Make describeTextQuote work, mostly.
Tests pass, still some issues with empty quotes at chunk edges
---
packages/dom/src/chunker.ts | 4 +-
packages/dom/src/normalize-range.ts | 4 ++
packages/dom/src/seek.ts | 19 ++++---
packages/dom/src/text-quote/describe.ts | 84 +++++++++++++++++++++++-------
packages/dom/src/text-quote/match.ts | 90 ++++++++++++++++++++++-----------
5 files changed, 145 insertions(+), 56 deletions(-)
diff --git a/packages/dom/src/chunker.ts b/packages/dom/src/chunker.ts
index 7209d7a..2becebf 100644
--- a/packages/dom/src/chunker.ts
+++ b/packages/dom/src/chunker.ts
@@ -104,11 +104,13 @@ export class TextNodeChunker implements Chunker<PartialTextNode> {
rangeToChunkRange(range: Range): ChunkRange<PartialTextNode> {
const textRange = normalizeRange(range);
+ // FIXME: normalizeRange can mess up: a collapsed range at the very end of
+ // the chunker’s scope might move to the next text node outside the scope.
const startChunk = this.nodeToChunk(textRange.startContainer);
const startIndex = textRange.startOffset - startChunk.startOffset;
const endChunk = this.nodeToChunk(textRange.endContainer);
- const endIndex = textRange.endOffset - endChunk.endOffset;
+ const endIndex = textRange.endOffset - endChunk.startOffset;
return { startChunk, startIndex, endChunk, endIndex };
}
diff --git a/packages/dom/src/normalize-range.ts b/packages/dom/src/normalize-range.ts
index 8616bce..0fece41 100644
--- a/packages/dom/src/normalize-range.ts
+++ b/packages/dom/src/normalize-range.ts
@@ -48,6 +48,10 @@ export interface TextRange extends Range {
// if there are equivalent text selections it takes the narrowest option (i.e.
// it prefers the start not to be at the end of a text node, and vice versa).
//
+// If there is no text between the start and end, they thus collapse onto one a
+// single position; and if there are multiple equivalent positions, it takes the
+// first one.
+//
// Note that if the given range does not contain non-empty text nodes, it will
// end up pointing at a text node outside of it (after it if possible, else
// before). If the document does not contain any text nodes, an error is thrown.
diff --git a/packages/dom/src/seek.ts b/packages/dom/src/seek.ts
index 1c821a9..36914a7 100644
--- a/packages/dom/src/seek.ts
+++ b/packages/dom/src/seek.ts
@@ -111,15 +111,14 @@ export class TextSeeker<TChunk extends Chunk<string>> implements Seeker<string>
if (this.position <= target) {
while (this.position <= target) { // could be `while (true)`?
- if (!roundUp && target < this.currentChunkPosition + this.currentChunk.data.length) {
- // The target is before the end of the current chunk.
+ if (
+ this.currentChunkPosition + this.currentChunk.data.length <= target
+ || (roundUp && this.offsetInChunk !== 0)
+ ) {
+ // The target is beyond the current chunk.
// (we use < not ≤: if the target is *at* the end of the chunk, possibly
// because the current chunk is empty, we prefer to take the next chunk)
- const newOffset = target - this.currentChunkPosition;
- if (read) result += this.currentChunk.data.substring(this.offsetInChunk, newOffset);
- this.offsetInChunk = newOffset;
- break;
- } else {
+
// Move to the start of the next chunk, while counting the characters of the current one.
if (read) result += this.currentChunk.data.substring(this.offsetInChunk);
const chunkLength = this.currentChunk.data.length;
@@ -140,6 +139,12 @@ export class TextSeeker<TChunk extends Chunk<string>> implements Seeker<string>
else
throw new RangeError(E_END);
}
+ } else {
+ // The target is within the current chunk.
+ const newOffset = target - this.currentChunkPosition;
+ if (read) result += this.currentChunk.data.substring(this.offsetInChunk, newOffset);
+ this.offsetInChunk = newOffset;
+ break;
}
}
} else { // Similar to the if-block, but moving backward in the text.
diff --git a/packages/dom/src/text-quote/describe.ts b/packages/dom/src/text-quote/describe.ts
index cbad0c3..1a7941d 100644
--- a/packages/dom/src/text-quote/describe.ts
+++ b/packages/dom/src/text-quote/describe.ts
@@ -26,17 +26,18 @@ import { TextSeeker, NonEmptyChunker } from '../seek';
export async function describeTextQuote(
range: Range,
- scope?: Range,
+ maybeScope?: Range,
): Promise<TextQuoteSelector> {
// Default to search in the whole document.
- if (scope === undefined) {
+ let scope: Range;
+ if (maybeScope !== undefined) {
+ scope = maybeScope;
+ } else {
const document = ownerDocument(range);
scope = document.createRange();
scope.selectNodeContents(document);
}
- const chunker = new TextNodeChunker(scope);
-
// Take the part of the range that falls within the scope.
range = range.cloneRange();
if (range.compareBoundaryPoints(Range.START_TO_START, scope) === -1)
@@ -44,17 +45,21 @@ export async function describeTextQuote(
if (range.compareBoundaryPoints(Range.END_TO_END, scope) === 1)
range.setEnd(scope.endContainer, scope.endOffset);
+ const chunker = new TextNodeChunker(scope);
+
return await abstractDescribeTextQuote(
chunker.rangeToChunkRange(range),
- chunker,
+ () => new TextNodeChunker(scope),
);
}
async function abstractDescribeTextQuote<TChunk extends Chunk<string>>(
target: ChunkRange<TChunk>,
- scope: Chunker<TChunk>,
+ scope: () => Chunker<TChunk>,
): Promise<TextQuoteSelector> {
- const seeker = new TextSeeker(scope as NonEmptyChunker<TChunk>);
+ const seeker = new TextSeeker(scope() as NonEmptyChunker<TChunk>);
+
+ // Read the target’s exact text.
seeker.seekToChunk(target.startChunk, target.startIndex);
const exact = seeker.readToChunk(target.endChunk, target.endIndex);
@@ -70,7 +75,8 @@ async function abstractDescribeTextQuote<TChunk extends Chunk<string>>(
prefix,
suffix,
}
- const matches = abstractTextQuoteSelectorMatcher(tentativeSelector)(scope);
+
+ const matches = abstractTextQuoteSelectorMatcher(tentativeSelector)(scope());
let nextMatch = await matches.next();
if (!nextMatch.done && chunkRangeEquals(nextMatch.value, target)) {
@@ -81,17 +87,19 @@ async function abstractDescribeTextQuote<TChunk extends Chunk<string>>(
// If there are no more unintended matches, our selector is unambiguous!
if (nextMatch.done) return tentativeSelector;
- // TODO either reset chunker to start from the beginning, or rewind the chunker by previous match’s length.
- // chunker.seekTo(0) or chunker.seek(-prefix)
+ // A subsequent search could safely skip the part we already processed,
+ // we’d need the matcher to start at the seeker’s position, instead of
+ // searching in the whole current chunk.
+ // seeker.seekBy(-prefix.length + 1);
// We’ll have to add more prefix/suffix to disqualify this unintended match.
const unintendedMatch = nextMatch.value;
- const seeker1 = new TextSeeker(scope as NonEmptyChunker<TChunk>);
- const seeker2 = new TextSeeker(scope as NonEmptyChunker<TChunk>); // TODO must clone scope.
+ const seeker1 = new TextSeeker(scope() as NonEmptyChunker<TChunk>);
+ const seeker2 = new TextSeeker(scope() as NonEmptyChunker<TChunk>);
// Count how many characters we’d need as a prefix to disqualify this match.
- seeker1.seekToChunk(target.startChunk, target.startIndex);
- seeker2.seekToChunk(unintendedMatch.startChunk, unintendedMatch.startIndex);
+ seeker1.seekToChunk(target.startChunk, target.startIndex - prefix.length);
+ seeker2.seekToChunk(unintendedMatch.startChunk, unintendedMatch.startIndex - prefix.length);
let sufficientPrefix: string | undefined = prefix;
while (true) {
let previousCharacter: string;
@@ -102,15 +110,53 @@ async function abstractDescribeTextQuote<TChunk extends Chunk<string>>(
break;
}
sufficientPrefix = previousCharacter + sufficientPrefix;
- if (previousCharacter !== seeker2.read(-1)) break;
+
+ // Break if the newly added character makes the prefix unambiguous.
+ try {
+ const unintendedMatchPreviousCharacter = seeker2.read(-1);
+ if (previousCharacter !== unintendedMatchPreviousCharacter) break;
+ } catch (err) {
+ if (err instanceof RangeError)
+ break;
+ else
+ throw err;
+ }
+ }
+
+ // Count how many characters we’d need as a suffix to disqualify this match.
+ seeker1.seekToChunk(target.endChunk, target.endIndex + suffix.length);
+ seeker2.seekToChunk(unintendedMatch.endChunk, unintendedMatch.endIndex + suffix.length);
+ let sufficientSuffix: string | undefined = suffix;
+ while (true) {
+ let nextCharacter: string;
+ try {
+ nextCharacter = seeker1.read(1);
+ } catch (err) {
+ sufficientSuffix = undefined; // End of text reached.
+ break;
+ }
+ sufficientSuffix += nextCharacter;
+
+ // Break if the newly added character makes the suffix unambiguous.
+ try {
+ const unintendedMatchNextCharacter = seeker2.read(1);
+ if (nextCharacter !== unintendedMatchNextCharacter) break;
+ } catch (err) {
+ if (err instanceof RangeError)
+ break;
+ else
+ throw err;
+ }
}
// Use either the prefix or suffix, whichever is shortest.
- if (sufficientPrefix !== undefined && (sufficientSuffix === undefined || sufficientPrefix.length <= sufficientSuffix.length))
- prefix = sufficientPrefix; // chunker.seek(prefix.length - sufficientPrefix.length)
- else if (sufficientSuffix !== undefined)
+ if (sufficientPrefix !== undefined && (sufficientSuffix === undefined || sufficientPrefix.length <= sufficientSuffix.length)) {
+ prefix = sufficientPrefix;
+ // seeker.seekBy(sufficientPrefix.length - prefix.length) // Would be required if we’d skip the processed part.
+ } else if (sufficientSuffix !== undefined) {
suffix = sufficientSuffix;
- else
+ } else {
throw new Error('Target cannot be disambiguated; how could that have happened‽');
+ }
}
}
diff --git a/packages/dom/src/text-quote/match.ts b/packages/dom/src/text-quote/match.ts
index 112e054..73cd1d7 100644
--- a/packages/dom/src/text-quote/match.ts
+++ b/packages/dom/src/text-quote/match.ts
@@ -53,41 +53,62 @@ export function abstractTextQuoteSelectorMatcher(
const suffix = selector.suffix || '';
const searchPattern = prefix + exact + suffix;
- let partialMatches: Array<{
- startChunk: TChunk;
- startIndex: number;
+ // The code below runs a loop with three steps:
+ // 1. Continue checking any partial matches from the previous chunk(s).
+ // 2. Try find the whole pattern in the chunk (possibly multiple times).
+ // 3. Check if this chunk ends with a partial match (or even multiple partial matches).
+
+ interface PartialMatch {
+ startChunk?: TChunk;
+ startIndex?: number;
+ endChunk?: TChunk;
+ endIndex?: number;
charactersMatched: number;
- }> = [];
+ }
+ let partialMatches: PartialMatch[] = [];
let chunk: TChunk | null;
while (chunk = textChunks.currentChunk) {
const chunkValue = chunk.data;
- // Continue checking any partial matches from the previous chunk(s).
+ // 1. Continue checking any partial matches from the previous chunk(s).
const remainingPartialMatches: typeof partialMatches = [];
- for (const { startChunk, startIndex, charactersMatched } of partialMatches) {
- if (searchPattern.length - charactersMatched > chunkValue.length) {
- if (chunkValue === searchPattern.substring(charactersMatched, charactersMatched + chunkValue.length)) {
- // The chunk is too short to complete the match; comparison has to be completed in subsequent chunks.
- remainingPartialMatches.push({
- startChunk,
- startIndex,
- charactersMatched: charactersMatched + chunkValue.length,
- });
+ for (const partialMatch of partialMatches) {
+ const charactersMatched = partialMatch.charactersMatched;
+
+ // If the current chunk contains the start and/or end of the match, record these.
+ if (partialMatch.endChunk === undefined) {
+ const charactersUntilMatchEnd = prefix.length + exact.length - charactersMatched;
+ if (charactersUntilMatchEnd <= chunkValue.length) {
+ partialMatch.endChunk = chunk;
+ partialMatch.endIndex = charactersUntilMatchEnd;
}
}
- else if (chunkValue.startsWith(searchPattern.substring(charactersMatched))) {
- yield {
- startChunk,
- startIndex,
- endChunk: chunk,
- endIndex: searchPattern.length - charactersMatched,
- };
+ if (partialMatch.startChunk === undefined) {
+ const charactersUntilMatchStart = prefix.length - charactersMatched;
+ if (
+ charactersUntilMatchStart < chunkValue.length
+ || partialMatch.endChunk !== undefined // handles an edge case: an empty quote at the end of a chunk.
+ ) {
+ partialMatch.startChunk = chunk;
+ partialMatch.startIndex = charactersUntilMatchStart;
+ }
+ }
+
+ const charactersUntilSuffixEnd = searchPattern.length - charactersMatched;
+ if (charactersUntilSuffixEnd <= chunkValue.length) {
+ if (chunkValue.startsWith(searchPattern.substring(charactersMatched))) {
+ yield partialMatch as ChunkRange<TChunk>; // all fields are certainly defined now.
+ }
+ } else if (chunkValue === searchPattern.substring(charactersMatched, charactersMatched + chunkValue.length)) {
+ // The chunk is too short to complete the match; comparison has to be completed in subsequent chunks.
+ partialMatch.charactersMatched += chunkValue.length;
+ remainingPartialMatches.push(partialMatch);
}
}
partialMatches = remainingPartialMatches;
- // Try find the whole pattern in the chunk (possibly multiple times).
+ // 2. Try find the whole pattern in the chunk (possibly multiple times).
if (searchPattern.length <= chunkValue.length) {
let fromIndex = 0;
while (fromIndex <= chunkValue.length) {
@@ -106,11 +127,11 @@ export function abstractTextQuoteSelectorMatcher(
};
// Advance the search forward to detect multiple occurrences within the same chunk.
- fromIndex = matchStartIndex + 1;
+ fromIndex = patternStartIndex + 1;
}
}
- // Check if this chunk ends with a partial match (or even multiple partial matches).
+ // 3. Check if this chunk ends with a partial match (or even multiple partial matches).
let newPartialMatches: number[] = [];
const searchStartPoint = Math.max(chunkValue.length - searchPattern.length + 1, 0);
for (let i = searchStartPoint; i < chunkValue.length; i++) {
@@ -121,11 +142,22 @@ export function abstractTextQuoteSelectorMatcher(
if (character === searchPattern[0]) newPartialMatches.push(i);
}
for (const partialMatchStartIndex of newPartialMatches) {
- partialMatches.push({
- startChunk: chunk,
- startIndex: partialMatchStartIndex,
- charactersMatched: chunkValue.length - partialMatchStartIndex,
- });
+ const charactersMatched = chunkValue.length - partialMatchStartIndex;
+ const partialMatch: PartialMatch = {
+ charactersMatched,
+ };
+ if (charactersMatched >= prefix.length + exact.length) {
+ partialMatch.endChunk = chunk;
+ partialMatch.endIndex = partialMatchStartIndex + prefix.length + exact.length;
+ }
+ if (
+ charactersMatched > prefix.length
+ || partialMatch.endChunk !== undefined // handles an edge case: an empty quote at the end of a chunk.
+ ) {
+ partialMatch.startChunk = chunk;
+ partialMatch.startIndex = partialMatchStartIndex + prefix.length;
+ }
+ partialMatches.push(partialMatch);
}
if (textChunks.nextChunk() === null)