diff --git a/src/module.ts b/src/module.ts index e4f871de7d..791f49bd60 100644 --- a/src/module.ts +++ b/src/module.ts @@ -10,6 +10,13 @@ import { BuiltinNames } from "./builtins"; import { Target } from "./common"; +import { + isHighSurrogate, + isLowSurrogate, + combineSurrogates, + SURROGATE_HIGH, + SURROGATE_LOW +} from "./util"; import * as binaryen from "./glue/binaryen"; /** A Binaryen-compatible index. */ @@ -3108,8 +3115,8 @@ function stringLengthUTF8(str: string): usize { } else if (c1 <= 0x7FF) { len += 2; } else if ( - (c1 & 0xFC00) === 0xD800 && i + 1 < k && - (str.charCodeAt(i + 1) & 0xFC00) === 0xDC00 + isHighSurrogate(c1) && i + 1 < k && + isLowSurrogate(str.charCodeAt(i + 1)) ) { i++; len += 4; @@ -3146,10 +3153,10 @@ function allocString(str: string | null): usize { binaryen.__i32_store8(idx++, (0xC0 | (c1 >>> 6) ) as u8); binaryen.__i32_store8(idx++, (0x80 | ( c1 & 63)) as u8); } else if ( - (c1 & 0xFC00) === 0xD800 && i + 1 < k && - ((c2 = str.charCodeAt(i + 1)) & 0xFC00) === 0xDC00 + isHighSurrogate(c1) && i + 1 < k && + isLowSurrogate(c2 = str.charCodeAt(i + 1)) ) { - c1 = 0x10000 + ((c1 & 0x3FF) << 10) | (c2 & 0x3FF); + c1 = combineSurrogates(c1, c2); ++i; binaryen.__i32_store8(idx++, (0xF0 | (c1 >>> 18) ) as u8); binaryen.__i32_store8(idx++, (0x80 | ((c1 >>> 12) & 63)) as u8); @@ -3209,10 +3216,11 @@ export function readString(ptr: usize): string | null { arr.push(cp); } else { let ch = cp - 0x10000; - arr.push(0xD800 | (ch >>> 10)); - arr.push(0xDC00 | (ch & 0x3FF)); + arr.push(SURROGATE_HIGH | (ch >>> 10)); + arr.push(SURROGATE_LOW | (ch & 0x3FF)); } } + // TODO: implement and use String.fromCodePoints return String.fromCharCodes(arr); } diff --git a/src/tokenizer.ts b/src/tokenizer.ts index 3ca4f1cd71..fab060fd5c 100644 --- a/src/tokenizer.ts +++ b/src/tokenizer.ts @@ -29,7 +29,9 @@ import { isIdentifierStart, isIdentifierPart, isDecimal, - isOctal + isOctal, + isHighSurrogate, + isLowSurrogate } from "./util"; /** Named token types. */ @@ -976,9 +978,9 @@ export class Tokenizer extends DiagnosticEmitter { break; } let start = pos++; - if ( // surrogate pair? - (c & 0xFC00) == 0xD800 && pos < end && - ((text.charCodeAt(pos)) & 0xFC00) == 0xDC00 + if ( + isHighSurrogate(c) && pos < end && + isLowSurrogate(text.charCodeAt(pos)) ) ++pos; this.error( DiagnosticCode.Invalid_character, @@ -1216,7 +1218,7 @@ export class Tokenizer extends DiagnosticEmitter { case CharCode.LINEFEED: case CharCode.LINESEPARATOR: case CharCode.PARAGRAPHSEPARATOR: return ""; - default: return String.fromCharCode(c); + default: return String.fromCodePoint(c); } } @@ -1677,7 +1679,7 @@ export class Tokenizer extends DiagnosticEmitter { return ""; } this.pos = pos; - return String.fromCharCode(value); + return String.fromCodePoint(value); } checkForIdentifierStartAfterNumericLiteral(): void { @@ -1739,12 +1741,7 @@ export class Tokenizer extends DiagnosticEmitter { ? text.substring(startIfTaggedTemplate, this.pos) : ""; } - return value32 < 0x10000 - ? String.fromCharCode(value32) - : String.fromCharCode( - ((value32 - 0x10000) >>> 10) | 0xD800, - ((value32 - 0x10000) & 1023) | 0xDC00 - ); + return String.fromCodePoint(value32); } } diff --git a/src/util/text.ts b/src/util/text.ts index 844bec41b2..6b3598e061 100644 --- a/src/util/text.ts +++ b/src/util/text.ts @@ -175,6 +175,50 @@ export function isWhiteSpace(c: i32): bool { } } +/** First high (lead) surrogate. */ +export const SURROGATE_HIGH = 0xD800; + +/** First low (trail) surrogate. */ +export const SURROGATE_LOW = 0xDC00; + +/** Tests if a code unit or code point is a surrogate. */ +export function isSurrogate(c: i32): bool { + // F800: 11111 0 0000000000 Mask + // D800: 11011 X XXXXXXXXXX Any surrogate + return (c & 0xF800) == SURROGATE_HIGH; +} + +/** Tests if a surrogate is a high (lead) surrogate. */ +export function isSurrogateHigh(c: i32): bool { + // D800-DBFF + return c < SURROGATE_LOW; +} + +/** Tests if a surrogate is a low (trail) surrogate. */ +export function isSurrogateLow(c: i32): bool { + // DC00-DFFF + return c >= SURROGATE_LOW; +} + +/** Tests if a code unit or code point is a high (lead) surrogate. */ +export function isHighSurrogate(c: i32): bool { + // FC00: 11111 1 0000000000 Mask + // D800: 11011 0 XXXXXXXXXX High/Lead surrogate + return (c & 0xFC00) == SURROGATE_HIGH; +} + +/** Tests if a code unit or code point is a low (trail) surrogate. */ +export function isLowSurrogate(c: i32): bool { + // FC00: 11111 1 0000000000 Mask + // DC00: 11011 1 XXXXXXXXXX Low/Trail surrogate + return (c & 0xFC00) == SURROGATE_LOW; +} + +/** Converts a surrogate pair to its respective code point. */ +export function combineSurrogates(hi: i32, lo: i32): i32 { + return 0x10000 + ((hi & 0x3FF) << 10) | (lo & 0x3FF); +} + export function isAlpha(c: i32): bool { let c0 = c | 32; // unify uppercases and lowercases a|A - z|Z return c0 >= CharCode.a && c0 <= CharCode.z;