You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@annotator.apache.org by ge...@apache.org on 2020/09/16 13:57:14 UTC
[incubator-annotator] branch improve-range-stuff updated: Support
matching across chunks
This is an automated email from the ASF dual-hosted git repository.
gerben pushed a commit to branch improve-range-stuff
in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git
The following commit(s) were added to refs/heads/improve-range-stuff by this push:
new 4e1d317 Support matching across chunks
4e1d317 is described below
commit 4e1d317dcef8ba89de53267961ed2f286da1cd6f
Author: Gerben <ge...@treora.com>
AuthorDate: Wed Sep 16 15:03:00 2020 +0200
Support matching across chunks
---
packages/dom/src/text-quote/match.ts | 83 +++++++++++++++++++++++-------
packages/dom/test/text-quote/match.test.ts | 8 +--
2 files changed, 69 insertions(+), 22 deletions(-)
diff --git a/packages/dom/src/text-quote/match.ts b/packages/dom/src/text-quote/match.ts
index 6b7fd93..c3fb43d 100644
--- a/packages/dom/src/text-quote/match.ts
+++ b/packages/dom/src/text-quote/match.ts
@@ -54,36 +54,83 @@ interface AbstractRange<TChunk> {
export function abstractTextQuoteSelectorMatcher(
selector: TextQuoteSelector,
): <TChunk extends Chunk>(textChunks: AsyncIterable<TChunk>) => AsyncGenerator<AbstractRange<TChunk>, void, void> {
- return async function* matchAll(textChunks) {
+ return async function* matchAll<TChunk extends Chunk>(textChunks: AsyncIterable<TChunk>) {
const exact = selector.exact;
const prefix = selector.prefix || '';
const suffix = selector.suffix || '';
const searchPattern = prefix + exact + suffix;
+ let partialMatches: Array<{
+ startChunk: TChunk;
+ startIndex: number;
+ charactersMatched: number;
+ }> = [];
+
for await (const chunk of textChunks) {
const chunkValue = chunk.toString();
- // Find the pattern in the chunk (possibly multiple times)
- // TODO allow pattern to be spread across chunks
- let fromIndex = 0;
- while (fromIndex <= chunkValue.length) {
- const patternStartIndex = chunkValue.indexOf(searchPattern, fromIndex);
- if (patternStartIndex === -1) break;
+ // Continue checking any partial matches from the previous chunk(s).
+ const remainingPartialMatches: typeof partialMatches = [];
+ for (const { startChunk, startIndex, charactersMatched } of partialMatches) {
+ if (searchPattern.length - charactersMatched > chunkValue.length) {
+ if (chunkValue === searchPattern.substring(charactersMatched, charactersMatched + chunkValue.length)) {
+ // The chunk is too short to complete the match; comparison has to be completed in subsequent chunks.
+ remainingPartialMatches.push({
+ startChunk,
+ startIndex,
+ charactersMatched: charactersMatched + chunkValue.length,
+ });
+ }
+ }
+ else if (chunkValue.startsWith(searchPattern.substring(charactersMatched))) {
+ yield {
+ startChunk,
+ startIndex,
+ endChunk: chunk,
+ endIndex: searchPattern.length - charactersMatched,
+ };
+ }
+ }
+ partialMatches = remainingPartialMatches;
+
+ // Try find the whole pattern in the chunk (possibly multiple times).
+ if (searchPattern.length <= chunkValue.length) {
+ let fromIndex = 0;
+ while (fromIndex <= chunkValue.length) {
+ const patternStartIndex = chunkValue.indexOf(searchPattern, fromIndex);
+ if (patternStartIndex === -1) break;
- // Correct for the prefix and suffix lengths.
- const matchStartIndex = patternStartIndex + prefix.length;
- const matchEndIndex = matchStartIndex + exact.length;
+ // Correct for the prefix and suffix lengths.
+ const matchStartIndex = patternStartIndex + prefix.length;
+ const matchEndIndex = matchStartIndex + exact.length;
- yield {
- startChunk: chunk,
- startIndex: matchStartIndex,
- endChunk: chunk,
- endIndex: matchEndIndex,
- };
+ yield {
+ startChunk: chunk,
+ startIndex: matchStartIndex,
+ endChunk: chunk,
+ endIndex: matchEndIndex,
+ };
+
+ // Advance the search forward to detect multiple occurrences within the same chunk.
+ fromIndex = matchStartIndex + 1;
+ }
+ }
- // Advance the search forward to detect multiple occurrences within the same chunk.
- fromIndex = matchStartIndex + 1;
+ // Check if this chunk ends with a partial match (or even multiple partial matches).
+ let newPartialMatches: number[] = [];
+ const searchStartPoint = Math.max(chunkValue.length - searchPattern.length + 1, 0);
+ for (let i = searchStartPoint; i < chunkValue.length; i++) {
+ const character = chunkValue[i];
+ newPartialMatches = newPartialMatches.filter(
+ partialMatchStartIndex => (character === searchPattern[i - partialMatchStartIndex])
+ );
+ if (character === searchPattern[0]) newPartialMatches.push(i);
}
+ newPartialMatches.forEach(partialMatchStartIndex => partialMatches.push({
+ startChunk: chunk,
+ startIndex: partialMatchStartIndex,
+ charactersMatched: chunkValue.length - partialMatchStartIndex,
+ }));
}
};
}
diff --git a/packages/dom/test/text-quote/match.test.ts b/packages/dom/test/text-quote/match.test.ts
index 7d52e15..fe8d36b 100644
--- a/packages/dom/test/text-quote/match.test.ts
+++ b/packages/dom/test/text-quote/match.test.ts
@@ -60,8 +60,8 @@ describe('createTextQuoteSelectorMatcher', () => {
{
startContainerXPath: '//b/text()[13]',
startOffset: 0,
- endContainerXPath: '//b/text()[21]',
- endOffset: 0,
+ endContainerXPath: '//b/text()[20]',
+ endOffset: 1,
},
]);
});
@@ -89,8 +89,8 @@ describe('createTextQuoteSelectorMatcher', () => {
{
startContainerXPath: '//b/text()[4]', // "dolor"
startOffset: 0,
- endContainerXPath: '//b/text()[8]', // "et yada yada"
- endOffset: 0,
+ endContainerXPath: '//b/text()[6]', // " am"
+ endOffset: 3,
},
]);
});