From e61a4fb226505e3c87219acd17c8a620118931f1 Mon Sep 17 00:00:00 2001 From: dcode Date: Mon, 15 Nov 2021 23:41:19 +0100 Subject: [PATCH 1/4] Refactor text util surrogate helpers --- src/module.ts | 16 +++++++++------- src/tokenizer.ts | 21 +++++++++------------ src/util/text.ts | 44 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 19 deletions(-) diff --git a/src/module.ts b/src/module.ts index e4f871de7d..694820d905 100644 --- a/src/module.ts +++ b/src/module.ts @@ -10,6 +10,7 @@ import { BuiltinNames } from "./builtins"; import { Target } from "./common"; +import { isHighSurrogate, isLowSurrogate, surrogatesToCodepoint, SURROGATE_HIGH, SURROGATE_LOW } from "./util"; import * as binaryen from "./glue/binaryen"; /** A Binaryen-compatible index. */ @@ -3108,8 +3109,8 @@ function stringLengthUTF8(str: string): usize { } else if (c1 <= 0x7FF) { len += 2; } else if ( - (c1 & 0xFC00) === 0xD800 && i + 1 < k && - (str.charCodeAt(i + 1) & 0xFC00) === 0xDC00 + isHighSurrogate(c1) && i + 1 < k && + isLowSurrogate(str.charCodeAt(i + 1)) ) { i++; len += 4; @@ -3146,10 +3147,10 @@ function allocString(str: string | null): usize { binaryen.__i32_store8(idx++, (0xC0 | (c1 >>> 6) ) as u8); binaryen.__i32_store8(idx++, (0x80 | ( c1 & 63)) as u8); } else if ( - (c1 & 0xFC00) === 0xD800 && i + 1 < k && - ((c2 = str.charCodeAt(i + 1)) & 0xFC00) === 0xDC00 + isHighSurrogate(c1) && i + 1 < k && + isLowSurrogate(c2 = str.charCodeAt(i + 1)) ) { - c1 = 0x10000 + ((c1 & 0x3FF) << 10) | (c2 & 0x3FF); + c1 = surrogatesToCodepoint(c1, c2); ++i; binaryen.__i32_store8(idx++, (0xF0 | (c1 >>> 18) ) as u8); binaryen.__i32_store8(idx++, (0x80 | ((c1 >>> 12) & 63)) as u8); @@ -3209,10 +3210,11 @@ export function readString(ptr: usize): string | null { arr.push(cp); } else { let ch = cp - 0x10000; - arr.push(0xD800 | (ch >>> 10)); - arr.push(0xDC00 | (ch & 0x3FF)); + arr.push(SURROGATE_HIGH | (ch >>> 10)); + arr.push(SURROGATE_LOW | (ch & 0x3FF)); } } + // TODO: implement and use String.fromCodePoints return String.fromCharCodes(arr); } diff --git a/src/tokenizer.ts b/src/tokenizer.ts index 3ca4f1cd71..fab060fd5c 100644 --- a/src/tokenizer.ts +++ b/src/tokenizer.ts @@ -29,7 +29,9 @@ import { isIdentifierStart, isIdentifierPart, isDecimal, - isOctal + isOctal, + isHighSurrogate, + isLowSurrogate } from "./util"; /** Named token types. */ @@ -976,9 +978,9 @@ export class Tokenizer extends DiagnosticEmitter { break; } let start = pos++; - if ( // surrogate pair? - (c & 0xFC00) == 0xD800 && pos < end && - ((text.charCodeAt(pos)) & 0xFC00) == 0xDC00 + if ( + isHighSurrogate(c) && pos < end && + isLowSurrogate(text.charCodeAt(pos)) ) ++pos; this.error( DiagnosticCode.Invalid_character, @@ -1216,7 +1218,7 @@ export class Tokenizer extends DiagnosticEmitter { case CharCode.LINEFEED: case CharCode.LINESEPARATOR: case CharCode.PARAGRAPHSEPARATOR: return ""; - default: return String.fromCharCode(c); + default: return String.fromCodePoint(c); } } @@ -1677,7 +1679,7 @@ export class Tokenizer extends DiagnosticEmitter { return ""; } this.pos = pos; - return String.fromCharCode(value); + return String.fromCodePoint(value); } checkForIdentifierStartAfterNumericLiteral(): void { @@ -1739,12 +1741,7 @@ export class Tokenizer extends DiagnosticEmitter { ? text.substring(startIfTaggedTemplate, this.pos) : ""; } - return value32 < 0x10000 - ? String.fromCharCode(value32) - : String.fromCharCode( - ((value32 - 0x10000) >>> 10) | 0xD800, - ((value32 - 0x10000) & 1023) | 0xDC00 - ); + return String.fromCodePoint(value32); } } diff --git a/src/util/text.ts b/src/util/text.ts index 844bec41b2..e7752bbef2 100644 --- a/src/util/text.ts +++ b/src/util/text.ts @@ -175,6 +175,50 @@ export function isWhiteSpace(c: i32): bool { } } +/** First high surrogate. */ +export const SURROGATE_HIGH = 0xD800; + +/** First low surrogate. */ +export const SURROGATE_LOW = 0xDC00; + +/** Tests if a code unit or code point is a surrogate. */ +export function isSurrogate(c: i32): bool { + // F800: 11111 0 0000000000 Mask + // D800: 11011 X XXXXXXXXXX Any surrogate + return (c & 0xF800) == SURROGATE_HIGH; +} + +/** Tests if a surrogate is a high (lead) surrogate. */ +export function isSurrogateHigh(c: i32): bool { + // D800-DBFF + return c < SURROGATE_LOW; +} + +/** Tests if a surrogate is a low (trail) surrogate. */ +export function isSurrogateLow(c: i32): bool { + // DC00-DFFF + return c >= SURROGATE_LOW; +} + +/** Tests if a code unit or code point is a high (lead) surrogate. */ +export function isHighSurrogate(c: i32): bool { + // FC00: 11111 1 0000000000 Mask + // D800: 11011 0 XXXXXXXXXX High/Lead surrogate + return (c & 0xFC00) == SURROGATE_HIGH; +} + +/** Tests if a code unit or code point is a low (trail) surrogate. */ +export function isLowSurrogate(c: i32): bool { + // FC00: 11111 1 0000000000 Mask + // DC00: 11011 1 XXXXXXXXXX Low/Trail surrogate + return (c & 0xFC00) == SURROGATE_LOW; +} + +/** Converts a surrogate pair to its respective code point. */ +export function surrogatesToCodepoint(hi: i32, lo: i32): i32 { + return 0x10000 + ((hi & 0x3FF) << 10) | (lo & 0x3FF); +} + export function isAlpha(c: i32): bool { let c0 = c | 32; // unify uppercases and lowercases a|A - z|Z return c0 >= CharCode.a && c0 <= CharCode.z; From 8a8132af41153558130eec54c8fe84dfaded1798 Mon Sep 17 00:00:00 2001 From: dcode Date: Mon, 15 Nov 2021 23:49:36 +0100 Subject: [PATCH 2/4] -> combineSurrogates --- src/module.ts | 4 ++-- src/util/text.ts | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/module.ts b/src/module.ts index 694820d905..bee3d44cda 100644 --- a/src/module.ts +++ b/src/module.ts @@ -10,7 +10,7 @@ import { BuiltinNames } from "./builtins"; import { Target } from "./common"; -import { isHighSurrogate, isLowSurrogate, surrogatesToCodepoint, SURROGATE_HIGH, SURROGATE_LOW } from "./util"; +import { isHighSurrogate, isLowSurrogate, combineSurrogates, SURROGATE_HIGH, SURROGATE_LOW } from "./util"; import * as binaryen from "./glue/binaryen"; /** A Binaryen-compatible index. */ @@ -3150,7 +3150,7 @@ function allocString(str: string | null): usize { isHighSurrogate(c1) && i + 1 < k && isLowSurrogate(c2 = str.charCodeAt(i + 1)) ) { - c1 = surrogatesToCodepoint(c1, c2); + c1 = combineSurrogates(c1, c2); ++i; binaryen.__i32_store8(idx++, (0xF0 | (c1 >>> 18) ) as u8); binaryen.__i32_store8(idx++, (0x80 | ((c1 >>> 12) & 63)) as u8); diff --git a/src/util/text.ts b/src/util/text.ts index e7752bbef2..33105ae33b 100644 --- a/src/util/text.ts +++ b/src/util/text.ts @@ -215,7 +215,7 @@ export function isLowSurrogate(c: i32): bool { } /** Converts a surrogate pair to its respective code point. */ -export function surrogatesToCodepoint(hi: i32, lo: i32): i32 { +export function combineSurrogates(hi: i32, lo: i32): i32 { return 0x10000 + ((hi & 0x3FF) << 10) | (lo & 0x3FF); } From c5b75a5dc715a865c00e904201d7bf74c8f2184c Mon Sep 17 00:00:00 2001 From: dcode Date: Mon, 15 Nov 2021 23:52:17 +0100 Subject: [PATCH 3/4] comment --- src/util/text.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/util/text.ts b/src/util/text.ts index 33105ae33b..6b3598e061 100644 --- a/src/util/text.ts +++ b/src/util/text.ts @@ -175,10 +175,10 @@ export function isWhiteSpace(c: i32): bool { } } -/** First high surrogate. */ +/** First high (lead) surrogate. */ export const SURROGATE_HIGH = 0xD800; -/** First low surrogate. */ +/** First low (trail) surrogate. */ export const SURROGATE_LOW = 0xDC00; /** Tests if a code unit or code point is a surrogate. */ From caea921c070f6f1860795f5f63b79f65525d3710 Mon Sep 17 00:00:00 2001 From: dcode Date: Tue, 16 Nov 2021 00:34:13 +0100 Subject: [PATCH 4/4] Update src/module.ts Co-authored-by: Max Graey --- src/module.ts | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/module.ts b/src/module.ts index bee3d44cda..791f49bd60 100644 --- a/src/module.ts +++ b/src/module.ts @@ -10,7 +10,13 @@ import { BuiltinNames } from "./builtins"; import { Target } from "./common"; -import { isHighSurrogate, isLowSurrogate, combineSurrogates, SURROGATE_HIGH, SURROGATE_LOW } from "./util"; +import { + isHighSurrogate, + isLowSurrogate, + combineSurrogates, + SURROGATE_HIGH, + SURROGATE_LOW +} from "./util"; import * as binaryen from "./glue/binaryen"; /** A Binaryen-compatible index. */