You are viewing a plain text version of this content. The canonical link to the original message was not preserved in this plain-text copy.
Posted to commits@annotator.apache.org by ge...@apache.org on 2020/10/17 14:42:58 UTC
[incubator-annotator] 02/02: Handle code points that cross chunks
This is an automated email from the ASF dual-hosted git repository.
gerben pushed a commit to branch import-dom-seek
in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git
commit 79dd711f45a3ffc7f6db003d23b02c68777c0bac
Author: Gerben <ge...@treora.com>
AuthorDate: Sat Oct 17 16:41:11 2020 +0200
Handle code points that cross chunks
Not actually needed in our scenario, but fun to implement correctly.
---
packages/dom/src/seek.ts | 32 ++++++++++++++++++++++++++++++--
1 file changed, 30 insertions(+), 2 deletions(-)
diff --git a/packages/dom/src/seek.ts b/packages/dom/src/seek.ts
index fc2af9a..732ab8e 100644
--- a/packages/dom/src/seek.ts
+++ b/packages/dom/src/seek.ts
@@ -202,22 +202,40 @@ class _CharSeeker implements Seeker<string[]> {
let result: string[] = [];
if (this.position < target) {
+ let unpairedSurrogate = '';
while (this.position < target) {
- characters = [...this.raw.read(1, true)];
+ let s = unpairedSurrogate + this.raw.read(1, true);
+ if (endsWithinCharacter(s)) {
+ unpairedSurrogate = s.slice(-1); // consider this half-character part of the next string.
+ s = s.slice(0,-1);
+ } else {
+ unpairedSurrogate = '';
+ }
+ characters = [...s];
this.position += characters.length;
if (read) result = result.concat(characters);
}
+ if (unpairedSurrogate) this.raw.seekBy(-1); // align with the last complete character.
if (!roundUp) {
const overshootInCodePoints = this.position - target;
const overshootInCodeUnits = characters.slice(overshootInCodePoints).join('').length;
this.raw.seekBy(-overshootInCodeUnits);
}
} else {
+ let unpairedSurrogate = '';
while (this.position > target) {
- characters = [...this.raw.read(-1, true)];
+ let s = this.raw.read(-1, true) + unpairedSurrogate;
+ if (startsWithinCharacter(s)) {
+ unpairedSurrogate = s[0];
+ s = s.slice(1);
+ } else {
+ unpairedSurrogate = '';
+ }
+ characters = [...s];
this.position -= characters.length;
if (read) result = characters.concat(result);
}
+ if (unpairedSurrogate) this.raw.seekBy(1);
if (!roundUp) {
const overshootInCodePoints = target - this.position;
const overshootInCodeUnits = characters.slice(0, overshootInCodePoints).join('').length;
@@ -244,3 +262,13 @@ export class CharSeeker extends _CharSeeker implements Seeker<string[]>, Boundar
function isText(node: Node): node is Text {
return node.nodeType === Node.TEXT_NODE;
}
+
+function endsWithinCharacter(s: string) {
+ const codeUnit = s.charCodeAt(s.length - 1);
+ return (0xD800 <= codeUnit && codeUnit <= 0xDBFF)
+}
+
+function startsWithinCharacter(s: string) {
+ const codeUnit = s.charCodeAt(0);
+ return (0xDC00 <= codeUnit && codeUnit <= 0xDFFF)
+}