You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@annotator.apache.org by ge...@apache.org on 2020/10/17 14:42:58 UTC

[incubator-annotator] 02/02: Handle code points that cross chunks

This is an automated email from the ASF dual-hosted git repository.

gerben pushed a commit to branch import-dom-seek
in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git

commit 79dd711f45a3ffc7f6db003d23b02c68777c0bac
Author: Gerben <ge...@treora.com>
AuthorDate: Sat Oct 17 16:41:11 2020 +0200

    Handle code points that cross chunks
    
    Not actually needed in our scenario, but fun to implement correctly.
---
 packages/dom/src/seek.ts | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/packages/dom/src/seek.ts b/packages/dom/src/seek.ts
index fc2af9a..732ab8e 100644
--- a/packages/dom/src/seek.ts
+++ b/packages/dom/src/seek.ts
@@ -202,22 +202,40 @@ class _CharSeeker implements Seeker<string[]> {
     let result: string[] = [];
 
     if (this.position < target) {
+      let unpairedSurrogate = '';
       while (this.position < target) {
-        characters = [...this.raw.read(1, true)];
+        let s = unpairedSurrogate + this.raw.read(1, true);
+        if (endsWithinCharacter(s)) {
+          unpairedSurrogate = s.slice(-1); // consider this half-character part of the next string.
+          s = s.slice(0,-1);
+        } else {
+          unpairedSurrogate = '';
+        }
+        characters = [...s];
         this.position += characters.length;
         if (read) result = result.concat(characters);
       }
+      if (unpairedSurrogate) this.raw.seekBy(-1); // align with the last complete character.
       if (!roundUp) {
         const overshootInCodePoints = this.position - target;
         const overshootInCodeUnits = characters.slice(overshootInCodePoints).join('').length;
         this.raw.seekBy(-overshootInCodeUnits);
       }
     } else {
+      let unpairedSurrogate = '';
       while (this.position > target) {
-        characters = [...this.raw.read(-1, true)];
+        let s = this.raw.read(-1, true) + unpairedSurrogate;
+        if (startsWithinCharacter(s)) {
+          unpairedSurrogate = s[0];
+          s = s.slice(1);
+        } else {
+          unpairedSurrogate = '';
+        }
+        characters = [...s];
         this.position -= characters.length;
         if (read) result = characters.concat(result);
       }
+      if (unpairedSurrogate) this.raw.seekBy(1);
       if (!roundUp) {
         const overshootInCodePoints = target - this.position;
         const overshootInCodeUnits = characters.slice(0, overshootInCodePoints).join('').length;
@@ -244,3 +262,13 @@ export class CharSeeker extends _CharSeeker implements Seeker<string[]>, Boundar
 function isText(node: Node): node is Text {
   return node.nodeType === Node.TEXT_NODE;
 }
+
+function endsWithinCharacter(s: string) { // True when the last code unit of s is a UTF-16 high (leading) surrogate, i.e. s stops halfway through a surrogate pair.
+  const codeUnit = s.charCodeAt(s.length - 1); // charCodeAt yields NaN for an empty string, so both comparisons below are false.
+  return (0xD800 <= codeUnit && codeUnit <= 0xDBFF) // 0xD800–0xDBFF is the high-surrogate code unit range.
+}
+
+function startsWithinCharacter(s: string) { // True when the first code unit of s is a UTF-16 low (trailing) surrogate, i.e. s begins halfway through a surrogate pair.
+  const codeUnit = s.charCodeAt(0); // charCodeAt yields NaN for an empty string, so both comparisons below are false.
+  return (0xDC00 <= codeUnit && codeUnit <= 0xDFFF) // 0xDC00–0xDFFF is the low-surrogate code unit range.
+}