From 8fc6cdac6f137a87d804b0ec74449e30c5401ab8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Jastrze=CC=A8bski?= Date: Thu, 25 Apr 2024 15:27:00 +0200 Subject: [PATCH 01/13] feat: unicode flag and construct --- src/builders.ts | 1 + src/constructs/__tests__/unicode.test.tsx | 136 ++++++++++++++++++++++ src/constructs/unicode.ts | 52 +++++++++ src/index.ts | 1 + src/types.ts | 13 ++- 5 files changed, 202 insertions(+), 1 deletion(-) create mode 100644 src/constructs/__tests__/unicode.test.tsx create mode 100644 src/constructs/unicode.ts diff --git a/src/builders.ts b/src/builders.ts index 5568761..6f29acd 100644 --- a/src/builders.ts +++ b/src/builders.ts @@ -32,6 +32,7 @@ function encodeFlags(flags: RegexFlags): string { if (flags.hasIndices) result += 'd'; if (flags.dotAll) result += 's'; if (flags.sticky) result += 'y'; + if (flags.unicode) result += 'u'; return result; } diff --git a/src/constructs/__tests__/unicode.test.tsx b/src/constructs/__tests__/unicode.test.tsx new file mode 100644 index 0000000..86e005d --- /dev/null +++ b/src/constructs/__tests__/unicode.test.tsx @@ -0,0 +1,136 @@ +import { + buildRegExp, + charClass, + endOfString, + type RegexSequence, + startOfString, + unicodeChar, + unicodeProp, +} from '../../index'; + +function u(sequence: RegexSequence) { + return buildRegExp(sequence, { unicode: true }); +} + +test('`unicodeChar` pattern', () => { + // eslint-disable-next-line no-control-regex + expect(unicodeChar(0)).toEqualRegex(/\u0000/); + // eslint-disable-next-line no-control-regex + expect(unicodeChar(0x1)).toEqualRegex(/\u0001/); + // eslint-disable-next-line no-control-regex + expect(unicodeChar(0x12)).toEqualRegex(/\u0012/); + expect(unicodeChar(0x123)).toEqualRegex(/\u0123/); + expect(unicodeChar(0x1234)).toEqualRegex(/\u1234/); + + // eslint-disable-next-line no-control-regex + expect(u(unicodeChar(0))).toEqualRegex(new RegExp('\\u0000', 'u')); + // eslint-disable-next-line no-control-regex + 
expect(u(unicodeChar(0x1))).toEqualRegex(new RegExp('\\u0001', 'u')); + expect(u(unicodeChar(0x12))).toEqualRegex( + // eslint-disable-next-line no-control-regex + new RegExp('\\u0012', 'u'), + ); + expect(unicodeChar(0x0123)).toEqualRegex(/\u0123/); + expect(unicodeChar(0x1234)).toEqualRegex(/\u1234/); + + expect(u(unicodeChar(0x0123))).toEqualRegex(/\u0123/u); + expect(u(unicodeChar(0x1234))).toEqualRegex(/\u1234/u); + expect(u(unicodeChar(0x12345))).toEqualRegex(new RegExp('\\u{12345}', 'u')); + expect(u(unicodeChar(0x103456))).toEqualRegex(new RegExp('\\u{103456}', 'u')); +}); + +test('`unicodeChar` matching', () => { + expect(unicodeChar(0)).toMatchString('\u{0}'); + expect(unicodeChar(0x1)).toMatchString('\u{1}'); + expect(unicodeChar(0x12)).toMatchString('\u{12}}'); + expect(unicodeChar(0x123)).toMatchString('\u{123}'); + expect(unicodeChar(0x1234)).toMatchString('\u{1234}}'); + expect(unicodeChar(0x12345)).not.toMatchString('\u{12345}'); + expect(unicodeChar(0x103456)).not.toMatchString('\u{103456}'); + + expect(unicodeChar('a'.codePointAt(0)!)).toMatchString('a'); + expect(unicodeChar('ą'.codePointAt(0)!)).toMatchString('ą'); + expect(unicodeChar('©'.codePointAt(0)!)).toMatchString('©'); + expect(unicodeChar('😎'.codePointAt(0)!)).not.toMatchString('😎'); + + expect(u(unicodeChar(0))).toMatchString('\u{0}'); + expect(u(unicodeChar(0))).not.toMatchString('a'); + expect(u(unicodeChar(0x1))).toMatchString('\u{1}'); + expect(u(unicodeChar(0x12))).toMatchString('\u{12}'); + expect(u(unicodeChar(0x123))).toMatchString('\u{123}'); + expect(u(unicodeChar(0x1234))).toMatchString('\u{1234}'); + expect(u(unicodeChar(0x12345))).toMatchString('\u{12345}'); + expect(u(unicodeChar(0x103456))).toMatchString('\u{103456}'); + + expect(u(unicodeChar('a'.codePointAt(0)!))).toMatchString('a'); + expect(u(unicodeChar('ą'.codePointAt(0)!))).toMatchString('ą'); + expect(u(unicodeChar('©'.codePointAt(0)!))).toMatchString('©'); + 
expect(u(unicodeChar('😎'.codePointAt(0)!))).toMatchString('😎'); + expect(u(unicodeChar('😎'.codePointAt(0)!))).toMatchString('\u{1f60e}'); +}); + +test('`unicodeChar` nesting matching', () => { + expect( + u(charClass(unicodeChar('a'.codePointAt(0)!), unicodeChar('ą'.codePointAt(0)!))), + ).toMatchString('a'); + expect( + u(charClass(unicodeChar('a'.codePointAt(0)!), unicodeChar('ą'.codePointAt(0)!))), + ).toMatchString('ą'); + expect( + u(charClass(unicodeChar('a'.codePointAt(0)!), unicodeChar('ą'.codePointAt(0)!))), + ).not.toMatchString('b'); +}); + +test('`unicodeProp` pattern', () => { + expect(unicodeProp('General_Category', 'Letter')).toEqualRegex(/\p{General_Category=Letter}/); + expect(unicodeProp('Letter')).toEqualRegex(/\p{Letter}/); + expect(unicodeProp('L')).toEqualRegex(/\p{L}/); + expect(unicodeProp('Lu')).toEqualRegex(/\p{Lu}/); + expect(unicodeProp('Ll')).toEqualRegex(/\p{Ll}/); + expect(unicodeProp('Lt')).toEqualRegex(/\p{Lt}/); + expect(unicodeProp('Lm')).toEqualRegex(/\p{Lm}/); + expect(unicodeProp('Lo')).toEqualRegex(/\p{Lo}/); + + expect(unicodeProp('Script', 'Latin')).toEqualRegex('\\p{Script=Latin}'); + expect(unicodeProp('Script', 'Grek')).toEqualRegex('\\p{Script=Grek}'); + expect(unicodeProp('sc', 'Cyrillic')).toEqualRegex('\\p{sc=Cyrillic}'); + + expect(unicodeProp('Script', 'Thaana')).toEqualRegex('\\p{Script=Thaana}'); + expect(unicodeProp('Script_Extensions', 'Thaana')).toEqualRegex('\\p{Script_Extensions=Thaana}'); + expect(unicodeProp('scx', 'Thaana')).toEqualRegex('\\p{scx=Thaana}'); + + expect(unicodeProp('Emoji')).toEqualRegex('\\p{Emoji}'); +}); + +test('`unicodeProp` matching', () => { + expect(u(unicodeProp('General_Category', 'Letter'))).toMatchString('A'); + expect(u(unicodeProp('Letter'))).toMatchString('A'); + expect(u(unicodeProp('L'))).toMatchString('A'); + + expect(u(unicodeProp('Uppercase'))).toMatchString('A'); + expect(u(unicodeProp('Uppercase'))).not.toMatchString('a'); + 
expect(u(unicodeProp('Lu'))).toMatchString('A'); + + expect(u(unicodeProp('Lowercase'))).toMatchString('a'); + expect(u(unicodeProp('Lowercase'))).not.toMatchString('A'); + expect(u(unicodeProp('Ll'))).toMatchString('a'); + + expect(u(unicodeProp('Script', 'Latin'))).toMatchString('A'); + expect(u(unicodeProp('Script', 'Latin'))).not.toMatchString('α'); + expect(u(unicodeProp('Script', 'Grek'))).toMatchString('α'); + expect(u(unicodeProp('Script', 'Grek'))).not.toMatchString('A'); + + // Basic emoji + expect(u([startOfString, unicodeProp('Emoji'), endOfString])).toMatchString('😎'); + expect(u([startOfString, unicodeProp('Emoji'), endOfString])).toMatchString('🐌'); + + // Complex emoji with skin tone modifier + expect(u(unicodeProp('Emoji'))).toMatchString('☝🏼'); + expect(u([startOfString, unicodeProp('Emoji'), endOfString])).not.toMatchString('☝🏼'); +}); + +test('`unicodeProp` nesting matching', () => { + expect(u(charClass(unicodeProp('Lowercase'), unicodeProp('White_Space')))).toMatchString('a'); + expect(u(charClass(unicodeProp('Lowercase'), unicodeProp('White_Space')))).toMatchString(' '); + expect(u(charClass(unicodeProp('Lowercase'), unicodeProp('White_Space')))).not.toMatchString('A'); +}); diff --git a/src/constructs/unicode.ts b/src/constructs/unicode.ts new file mode 100644 index 0000000..7d59241 --- /dev/null +++ b/src/constructs/unicode.ts @@ -0,0 +1,52 @@ +import { type CharacterClass, encodeCharacterClass } from './character-class'; + +/** + * Unicode character escape. + * + * Regex pattern: + * - `\uXXXX`: 4-digit hex escape for code points below 0x10000. + * - `\u{X}`: Unicode code point escape for code points above 0xFFFF. + * + * Note: for code points above 0xFFFF, the regex engine must be [unicode-aware](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode). + * + * @param codePoint The code point of the character to escape. 
+ * @returns A character class representing the unicode escape. + */ +export function unicodeChar(codePoint: number): CharacterClass { + if (!Number.isInteger(codePoint)) { + throw new TypeError('Expected an integer code point but got: ' + codePoint); + } + + if (codePoint < 0) { + throw new RangeError('Code point must be a positive integer but got: ' + codePoint); + } + + let escape = + codePoint < 0x10000 + ? `\\u${codePoint.toString(16).padStart(4, '0')}` // 4-digit hex (works in all modes) + : `\\u{${codePoint.toString(16)}}`; // 1-6 digit hex (requires unicode-aware mode) + + return { + type: 'characterClass', + escape, + encode: encodeCharacterClass, + }; +} + +/** + * Unicode character class escape matching a set of characters specified by a Unicode property. + * + * Regex pattern: `\p{Property}` or `\p{Property=Value}` + * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Unicode_character_class_escape + * + * @param prop Unicode property name. + * @param value Unicode property value (optional). + * @returns A character class representing the unicode property escape. + */ +export function unicodeProp(prop: string, value?: string): CharacterClass { + return { + type: 'characterClass', + escape: `\\p{${prop}${value ? 
`=${value}` : ''}}`, + encode: encodeCharacterClass, + }; +} diff --git a/src/index.ts b/src/index.ts index d2c5791..9ca3523 100644 --- a/src/index.ts +++ b/src/index.ts @@ -37,3 +37,4 @@ export { negativeLookbehind } from './constructs/negative-lookbehind'; export { oneOrMore, optional, zeroOrMore } from './constructs/quantifiers'; export { regex } from './constructs/regex'; export { repeat } from './constructs/repeat'; +export { unicodeChar, unicodeProp } from './constructs/unicode'; diff --git a/src/types.ts b/src/types.ts index 2b102d5..5436499 100644 --- a/src/types.ts +++ b/src/types.ts @@ -79,8 +79,19 @@ export interface RegexFlags { dotAll?: boolean; /** - * MDN: _Matches only from the index indicated by the lastIndex property of this regular expression in the target string. Does not attempt to match from any later indexes._ + * MDN: _Matches only from the index indicated by the `lastIndex` property of this regular expression in the target string. Does not attempt to match from any later indexes._ * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/sticky */ sticky?: boolean; + + /** + * Enables [Unicode-aware mode](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode). 
+ * + * This enables features like: + * - unicode character escapes: `\u{xxxx}` + * - unicode character class escapes:`\p{UnicodePropertyValue}` + * + * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode + */ + unicode?: boolean; } From d87e494b68a75f28e2795ea3ba00ba7dafdb4561 Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Mon, 6 May 2024 13:11:22 +0200 Subject: [PATCH 02/13] refactor: finish rebase --- src/constructs/unicode.ts | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/constructs/unicode.ts b/src/constructs/unicode.ts index 7d59241..de2aebd 100644 --- a/src/constructs/unicode.ts +++ b/src/constructs/unicode.ts @@ -1,4 +1,4 @@ -import { type CharacterClass, encodeCharacterClass } from './character-class'; +import type { CharacterEscape } from '../types'; /** * Unicode character escape. @@ -12,7 +12,7 @@ import { type CharacterClass, encodeCharacterClass } from './character-class'; * @param codePoint The code point of the character to escape. * @returns A character class representing the unicode escape. */ -export function unicodeChar(codePoint: number): CharacterClass { +export function unicodeChar(codePoint: number): CharacterEscape { if (!Number.isInteger(codePoint)) { throw new TypeError('Expected an integer code point but got: ' + codePoint); } @@ -27,9 +27,9 @@ export function unicodeChar(codePoint: number): CharacterClass { : `\\u{${codePoint.toString(16)}}`; // 1-6 digit hex (requires unicode-aware mode) return { - type: 'characterClass', - escape, - encode: encodeCharacterClass, + precedence: 'atom', + pattern: escape, + chars: [escape], }; } @@ -43,10 +43,12 @@ export function unicodeChar(codePoint: number): CharacterClass { * @param value Unicode property value (optional). * @returns A character class representing the unicode property escape. 
*/ -export function unicodeProp(prop: string, value?: string): CharacterClass { +export function unicodeProp(prop: string, value?: string): CharacterEscape { + const escape = `\\p{${prop}${value ? `=${value}` : ''}}`; + return { - type: 'characterClass', - escape: `\\p{${prop}${value ? `=${value}` : ''}}`, - encode: encodeCharacterClass, + precedence: 'atom', + pattern: escape, + chars: [escape], }; } From ce9de895657224cbed7fa0f7c4b706b09fda6160 Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Mon, 6 May 2024 14:13:26 +0200 Subject: [PATCH 03/13] docs: wip --- website/docs/api/unicode.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 website/docs/api/unicode.md diff --git a/website/docs/api/unicode.md b/website/docs/api/unicode.md new file mode 100644 index 0000000..d3a9a56 --- /dev/null +++ b/website/docs/api/unicode.md @@ -0,0 +1,33 @@ +--- +id: character-classes +title: Unicode +--- + +### Unicode-aware mode + +JavaScript `RegExp` object offers [Unicode-aware](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode). + +### Unicode character escapes + +```ts +function unicodeChar(): CharacterEscape; +``` + +Regex syntax: + +- `\uXXXX`: 4-digit hex escape for code points below 0x10000. +- `\u{X}`: Unicode code point escape for code points above 0xFFFF. + +Note: for code points above 0xFFFF, the regex engine must be [unicode-aware](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode). + +### Unicode character class escapes + +```ts +function unicodeProp(prop: string, value?: string): CharacterEscape; +``` + +Unicode character class escape matching a set of characters specified by a Unicode property. 
+ +Regex syntax: `\p{Property}` or `\p{Property=Value}` + +@see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Unicode_character_class_escape From 2816f324941da61d795984d5a1e5895d8260b2ba Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Tue, 7 May 2024 00:00:13 +0200 Subject: [PATCH 04/13] chore: tweaks --- src/constructs/__tests__/unicode.test.tsx | 17 +++++++++++++++++ src/constructs/unicode.ts | 8 ++------ 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/src/constructs/__tests__/unicode.test.tsx b/src/constructs/__tests__/unicode.test.tsx index 86e005d..8bdae6b 100644 --- a/src/constructs/__tests__/unicode.test.tsx +++ b/src/constructs/__tests__/unicode.test.tsx @@ -81,6 +81,23 @@ test('`unicodeChar` nesting matching', () => { ).not.toMatchString('b'); }); +test('`unicodeChar` edge cases handling', () => { + expect(() => u(unicodeChar(NaN))).toThrowErrorMatchingInlineSnapshot( + `""unicodeChar": expected valid unicode code point but got: NaN"`, + ); + expect(() => u(unicodeChar(1.5))).toThrowErrorMatchingInlineSnapshot( + `""unicodeChar": expected valid unicode code point but got: 1.5"`, + ); + expect(() => u(unicodeChar(-1))).toThrowErrorMatchingInlineSnapshot( + `""unicodeChar": expected valid unicode code point but got: -1"`, + ); + expect(() => u(unicodeChar(0x110000))).toThrowErrorMatchingInlineSnapshot( + `""unicodeChar": expected valid unicode code point but got: 1114112"`, + ); + + expect(u(unicodeChar(0x10ffff))).toEqualRegex(/\u{10ffff}/u); +}); + test('`unicodeProp` pattern', () => { expect(unicodeProp('General_Category', 'Letter')).toEqualRegex(/\p{General_Category=Letter}/); expect(unicodeProp('Letter')).toEqualRegex(/\p{Letter}/); diff --git a/src/constructs/unicode.ts b/src/constructs/unicode.ts index de2aebd..9ecac17 100644 --- a/src/constructs/unicode.ts +++ b/src/constructs/unicode.ts @@ -13,12 +13,8 @@ import type { CharacterEscape } from '../types'; * @returns A character class 
representing the unicode escape. */ export function unicodeChar(codePoint: number): CharacterEscape { - if (!Number.isInteger(codePoint)) { - throw new TypeError('Expected an integer code point but got: ' + codePoint); - } - - if (codePoint < 0) { - throw new RangeError('Code point must be a positive integer but got: ' + codePoint); + if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10ffff) { + throw new RangeError('"unicodeChar": expected valid unicode code point but got: ' + codePoint); } let escape = From b5599026a24efe1eb90360726a99835a35aa5b64 Mon Sep 17 00:00:00 2001 From: Maciej Jastrzebski Date: Fri, 12 Jul 2024 16:34:46 +0200 Subject: [PATCH 05/13] refactor: add check for invalid unicode-aware feature usage --- src/__tests__/builder.test.ts | 16 ++- src/builders.ts | 6 ++ src/constructs/__tests__/unicode.test.tsx | 114 +++++++++++++--------- src/constructs/unicode.ts | 10 +- src/index.ts | 2 +- website/docs/api/unicode.md | 2 +- 6 files changed, 99 insertions(+), 51 deletions(-) diff --git a/src/__tests__/builder.test.ts b/src/__tests__/builder.test.ts index 7cf2797..90b76fe 100644 --- a/src/__tests__/builder.test.ts +++ b/src/__tests__/builder.test.ts @@ -1,4 +1,5 @@ -import { buildRegExp } from '..'; +import { buildRegExp, unicodeChar } from '..'; +import { unicodeProperty } from '../constructs/unicode'; test('`regexBuilder` flags', () => { expect(buildRegExp('a').flags).toBe(''); @@ -32,3 +33,16 @@ test('`regexBuilder` flags', () => { }).flags, ).toBe('gisy'); }); + +test('`regexBuilder` throws when using unicode-aware features without `unicode` flag', () => { + expect(() => buildRegExp(unicodeChar(0x1234))).not.toThrow(); + expect(() => buildRegExp(unicodeChar(0x12345), { unicode: true })).not.toThrow(); + expect(() => buildRegExp(unicodeProperty('Emoji_Presentation'), { unicode: true })).not.toThrow(); + + expect(() => buildRegExp(unicodeChar(0x123456))).toThrowErrorMatchingInlineSnapshot( + `""unicodeChar": expected valid unicode 
code point but got: 1193046"`, + ); + expect(() => + buildRegExp(unicodeProperty('Emoji_Presentation')), + ).toThrowErrorMatchingInlineSnapshot(`"Unicode-aware regex pattern requires "unicode" flag"`); +}); diff --git a/src/builders.ts b/src/builders.ts index 6f29acd..1d6a6b9 100644 --- a/src/builders.ts +++ b/src/builders.ts @@ -1,5 +1,6 @@ import type { RegexFlags, RegexSequence } from './types'; import { encode } from './encoder'; +import { hasUnicodeAwareRegex } from './constructs/unicode'; /** * Generate RegExp object from elements with optional flags. @@ -11,6 +12,11 @@ import { encode } from './encoder'; export function buildRegExp(sequence: RegexSequence, flags?: RegexFlags): RegExp { const pattern = encode(sequence).pattern; const flagsString = encodeFlags(flags ?? {}); + + if (hasUnicodeAwareRegex(pattern) && !flags?.unicode) { + throw new Error('Unicode-aware regex pattern requires "unicode" flag'); + } + return new RegExp(pattern, flagsString); } diff --git a/src/constructs/__tests__/unicode.test.tsx b/src/constructs/__tests__/unicode.test.tsx index 8bdae6b..b3fffed 100644 --- a/src/constructs/__tests__/unicode.test.tsx +++ b/src/constructs/__tests__/unicode.test.tsx @@ -5,8 +5,9 @@ import { type RegexSequence, startOfString, unicodeChar, - unicodeProp, + unicodeProperty, } from '../../index'; +import { hasUnicodeAwareRegex } from '../unicode'; function u(sequence: RegexSequence) { return buildRegExp(sequence, { unicode: true }); @@ -45,13 +46,10 @@ test('`unicodeChar` matching', () => { expect(unicodeChar(0x12)).toMatchString('\u{12}}'); expect(unicodeChar(0x123)).toMatchString('\u{123}'); expect(unicodeChar(0x1234)).toMatchString('\u{1234}}'); - expect(unicodeChar(0x12345)).not.toMatchString('\u{12345}'); - expect(unicodeChar(0x103456)).not.toMatchString('\u{103456}'); expect(unicodeChar('a'.codePointAt(0)!)).toMatchString('a'); expect(unicodeChar('ą'.codePointAt(0)!)).toMatchString('ą'); expect(unicodeChar('©'.codePointAt(0)!)).toMatchString('©'); - 
expect(unicodeChar('😎'.codePointAt(0)!)).not.toMatchString('😎'); expect(u(unicodeChar(0))).toMatchString('\u{0}'); expect(u(unicodeChar(0))).not.toMatchString('a'); @@ -98,56 +96,82 @@ test('`unicodeChar` edge cases handling', () => { expect(u(unicodeChar(0x10ffff))).toEqualRegex(/\u{10ffff}/u); }); -test('`unicodeProp` pattern', () => { - expect(unicodeProp('General_Category', 'Letter')).toEqualRegex(/\p{General_Category=Letter}/); - expect(unicodeProp('Letter')).toEqualRegex(/\p{Letter}/); - expect(unicodeProp('L')).toEqualRegex(/\p{L}/); - expect(unicodeProp('Lu')).toEqualRegex(/\p{Lu}/); - expect(unicodeProp('Ll')).toEqualRegex(/\p{Ll}/); - expect(unicodeProp('Lt')).toEqualRegex(/\p{Lt}/); - expect(unicodeProp('Lm')).toEqualRegex(/\p{Lm}/); - expect(unicodeProp('Lo')).toEqualRegex(/\p{Lo}/); - - expect(unicodeProp('Script', 'Latin')).toEqualRegex('\\p{Script=Latin}'); - expect(unicodeProp('Script', 'Grek')).toEqualRegex('\\p{Script=Grek}'); - expect(unicodeProp('sc', 'Cyrillic')).toEqualRegex('\\p{sc=Cyrillic}'); - - expect(unicodeProp('Script', 'Thaana')).toEqualRegex('\\p{Script=Thaana}'); - expect(unicodeProp('Script_Extensions', 'Thaana')).toEqualRegex('\\p{Script_Extensions=Thaana}'); - expect(unicodeProp('scx', 'Thaana')).toEqualRegex('\\p{scx=Thaana}'); - - expect(unicodeProp('Emoji')).toEqualRegex('\\p{Emoji}'); +test('`unicodeProperty` pattern', () => { + expect(u(unicodeProperty('General_Category', 'Letter'))).toEqualRegex( + /\p{General_Category=Letter}/u, + ); + expect(u(unicodeProperty('Letter'))).toEqualRegex(/\p{Letter}/u); + expect(u(unicodeProperty('L'))).toEqualRegex(/\p{L}/u); + expect(u(unicodeProperty('Lu'))).toEqualRegex(/\p{Lu}/u); + expect(u(unicodeProperty('Ll'))).toEqualRegex(/\p{Ll}/u); + expect(u(unicodeProperty('Lt'))).toEqualRegex(/\p{Lt}/u); + expect(u(unicodeProperty('Lm'))).toEqualRegex(/\p{Lm}/u); + expect(u(unicodeProperty('Lo'))).toEqualRegex(/\p{Lo}/u); + + expect(u(unicodeProperty('Script', 
'Latin'))).toEqualRegex('\\p{Script=Latin}'); + expect(u(unicodeProperty('Script', 'Grek'))).toEqualRegex('\\p{Script=Grek}'); + expect(u(unicodeProperty('sc', 'Cyrillic'))).toEqualRegex('\\p{sc=Cyrillic}'); + + expect(u(unicodeProperty('Script', 'Thaana'))).toEqualRegex('\\p{Script=Thaana}'); + expect(u(unicodeProperty('Script_Extensions', 'Thaana'))).toEqualRegex( + '\\p{Script_Extensions=Thaana}', + ); + expect(u(unicodeProperty('scx', 'Thaana'))).toEqualRegex('\\p{scx=Thaana}'); + + expect(u(unicodeProperty('Emoji'))).toEqualRegex('\\p{Emoji}'); }); -test('`unicodeProp` matching', () => { - expect(u(unicodeProp('General_Category', 'Letter'))).toMatchString('A'); - expect(u(unicodeProp('Letter'))).toMatchString('A'); - expect(u(unicodeProp('L'))).toMatchString('A'); +test('`unicodeProperty` matching', () => { + expect(u(unicodeProperty('General_Category', 'Letter'))).toMatchString('A'); + expect(u(unicodeProperty('Letter'))).toMatchString('A'); + expect(u(unicodeProperty('L'))).toMatchString('A'); - expect(u(unicodeProp('Uppercase'))).toMatchString('A'); - expect(u(unicodeProp('Uppercase'))).not.toMatchString('a'); - expect(u(unicodeProp('Lu'))).toMatchString('A'); + expect(u(unicodeProperty('Uppercase'))).toMatchString('A'); + expect(u(unicodeProperty('Uppercase'))).not.toMatchString('a'); + expect(u(unicodeProperty('Lu'))).toMatchString('A'); - expect(u(unicodeProp('Lowercase'))).toMatchString('a'); - expect(u(unicodeProp('Lowercase'))).not.toMatchString('A'); - expect(u(unicodeProp('Ll'))).toMatchString('a'); + expect(u(unicodeProperty('Lowercase'))).toMatchString('a'); + expect(u(unicodeProperty('Lowercase'))).not.toMatchString('A'); + expect(u(unicodeProperty('Ll'))).toMatchString('a'); - expect(u(unicodeProp('Script', 'Latin'))).toMatchString('A'); - expect(u(unicodeProp('Script', 'Latin'))).not.toMatchString('α'); - expect(u(unicodeProp('Script', 'Grek'))).toMatchString('α'); - expect(u(unicodeProp('Script', 'Grek'))).not.toMatchString('A'); + 
expect(u(unicodeProperty('Script', 'Latin'))).toMatchString('A'); + expect(u(unicodeProperty('Script', 'Latin'))).not.toMatchString('α'); + expect(u(unicodeProperty('Script', 'Grek'))).toMatchString('α'); + expect(u(unicodeProperty('Script', 'Grek'))).not.toMatchString('A'); // Basic emoji - expect(u([startOfString, unicodeProp('Emoji'), endOfString])).toMatchString('😎'); - expect(u([startOfString, unicodeProp('Emoji'), endOfString])).toMatchString('🐌'); + expect(u([startOfString, unicodeProperty('Emoji'), endOfString])).toMatchString('😎'); + expect(u([startOfString, unicodeProperty('Emoji'), endOfString])).toMatchString('🐌'); // Complex emoji with skin tone modifier - expect(u(unicodeProp('Emoji'))).toMatchString('☝🏼'); - expect(u([startOfString, unicodeProp('Emoji'), endOfString])).not.toMatchString('☝🏼'); + expect(u(unicodeProperty('Emoji'))).toMatchString('☝🏼'); + expect(u([startOfString, unicodeProperty('Emoji'), endOfString])).not.toMatchString('☝🏼'); +}); + +test('`unicodeProperty` nesting matching', () => { + expect(u(charClass(unicodeProperty('Lowercase'), unicodeProperty('White_Space')))).toMatchString( + 'a', + ); + expect(u(charClass(unicodeProperty('Lowercase'), unicodeProperty('White_Space')))).toMatchString( + ' ', + ); + expect( + u(charClass(unicodeProperty('Lowercase'), unicodeProperty('White_Space'))), + ).not.toMatchString('A'); }); -test('`unicodeProp` nesting matching', () => { - expect(u(charClass(unicodeProp('Lowercase'), unicodeProp('White_Space')))).toMatchString('a'); - expect(u(charClass(unicodeProp('Lowercase'), unicodeProp('White_Space')))).toMatchString(' '); - expect(u(charClass(unicodeProp('Lowercase'), unicodeProp('White_Space')))).not.toMatchString('A'); +test('has unicode-aware regex', () => { + expect(hasUnicodeAwareRegex(/\p{Emoji_Presentation}/u.source)).toBe(true); + expect(hasUnicodeAwareRegex(/aaa\p{Emoji_Presentation}/u.source)).toBe(true); + expect(hasUnicodeAwareRegex(/\p{Emoji_Presentation}bbb/u.source)).toBe(true); + 
expect(hasUnicodeAwareRegex(/\u{123}/u.source)).toBe(true); + expect(hasUnicodeAwareRegex(/\u{01234}/u.source)).toBe(true); + expect(hasUnicodeAwareRegex(/aaa\u{01234}/u.source)).toBe(true); + expect(hasUnicodeAwareRegex(/\u{01234}bbb/u.source)).toBe(true); + + expect(hasUnicodeAwareRegex(/\x23/.source)).toBe(false); + expect(hasUnicodeAwareRegex(/\u0123/.source)).toBe(false); + expect(hasUnicodeAwareRegex(/\u1f60/.source)).toBe(false); + expect(hasUnicodeAwareRegex(/a/.source)).toBe(false); + expect(hasUnicodeAwareRegex(/abc/.source)).toBe(false); }); diff --git a/src/constructs/unicode.ts b/src/constructs/unicode.ts index 9ecac17..f0b3cdd 100644 --- a/src/constructs/unicode.ts +++ b/src/constructs/unicode.ts @@ -35,12 +35,12 @@ export function unicodeChar(codePoint: number): CharacterEscape { * Regex pattern: `\p{Property}` or `\p{Property=Value}` * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Unicode_character_class_escape * - * @param prop Unicode property name. + * @param property Unicode property name. * @param value Unicode property value (optional). * @returns A character class representing the unicode property escape. */ -export function unicodeProp(prop: string, value?: string): CharacterEscape { - const escape = `\\p{${prop}${value ? `=${value}` : ''}}`; +export function unicodeProperty(property: string, value?: string): CharacterEscape { + const escape = `\\p{${property}${value ? 
`=${value}` : ''}}`; return { precedence: 'atom', @@ -48,3 +48,7 @@ export function unicodeProp(prop: string, value?: string): CharacterEscape { chars: [escape], }; } + +export function hasUnicodeAwareRegex(pattern: string): boolean { + return /\\(u|p)\{/.test(pattern); +} diff --git a/src/index.ts b/src/index.ts index 9ca3523..4ef5c5c 100644 --- a/src/index.ts +++ b/src/index.ts @@ -37,4 +37,4 @@ export { negativeLookbehind } from './constructs/negative-lookbehind'; export { oneOrMore, optional, zeroOrMore } from './constructs/quantifiers'; export { regex } from './constructs/regex'; export { repeat } from './constructs/repeat'; -export { unicodeChar, unicodeProp } from './constructs/unicode'; +export { unicodeChar, unicodeProperty } from './constructs/unicode'; diff --git a/website/docs/api/unicode.md b/website/docs/api/unicode.md index d3a9a56..b1f3386 100644 --- a/website/docs/api/unicode.md +++ b/website/docs/api/unicode.md @@ -23,7 +23,7 @@ Note: for code points above 0xFFFF, the regex engine must be [unicode-aware](htt ### Unicode character class escapes ```ts -function unicodeProp(prop: string, value?: string): CharacterEscape; +function unicodeProperty(property: string, value?: string): CharacterEscape; ``` Unicode character class escape matching a set of characters specified by a Unicode property. 
From a4b776e6ffb2d996dba3d22d506ce9d9c691ff4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Jastrze=CC=A8bski?= Date: Fri, 6 Sep 2024 21:24:34 +0200 Subject: [PATCH 06/13] refactor: tweaks --- src/__tests__/builder.test.ts | 12 ++++++++-- src/builders.ts | 11 ++++++--- src/constructs/__tests__/unicode.test.tsx | 25 ++++--------------- src/constructs/unicode.ts | 29 ++++++++++++++++++++--- 4 files changed, 48 insertions(+), 29 deletions(-) diff --git a/src/__tests__/builder.test.ts b/src/__tests__/builder.test.ts index 90b76fe..1668bbd 100644 --- a/src/__tests__/builder.test.ts +++ b/src/__tests__/builder.test.ts @@ -40,9 +40,17 @@ test('`regexBuilder` throws when using unicode-aware features without `unicode` expect(() => buildRegExp(unicodeProperty('Emoji_Presentation'), { unicode: true })).not.toThrow(); expect(() => buildRegExp(unicodeChar(0x123456))).toThrowErrorMatchingInlineSnapshot( - `""unicodeChar": expected valid unicode code point but got: 1193046"`, + `""unicodeChar": expected a valid unicode code point but got: 1193046"`, + ); + expect(() => buildRegExp(unicodeChar(0x12345))).toThrowErrorMatchingInlineSnapshot( + `"The pattern "\\u{12345}" requires Unicode-aware mode. Please ensure the "unicode" flag is set."`, ); expect(() => buildRegExp(unicodeProperty('Emoji_Presentation')), - ).toThrowErrorMatchingInlineSnapshot(`"Unicode-aware regex pattern requires "unicode" flag"`); + ).toThrowErrorMatchingInlineSnapshot( + `"The pattern "\\p{Emoji_Presentation}" requires Unicode-aware mode. Please ensure the "unicode" flag is set."`, + ); + expect(() => buildRegExp(/\P{Letter}/u)).toThrowErrorMatchingInlineSnapshot( + `"The pattern "\\P{Letter}" requires Unicode-aware mode. 
Please ensure the "unicode" flag is set."`, + ); }); diff --git a/src/builders.ts b/src/builders.ts index 1d6a6b9..b8e9d7c 100644 --- a/src/builders.ts +++ b/src/builders.ts @@ -1,6 +1,6 @@ import type { RegexFlags, RegexSequence } from './types'; import { encode } from './encoder'; -import { hasUnicodeAwareRegex } from './constructs/unicode'; +import { getFirstUnicodeAwarePattern } from './constructs/unicode'; /** * Generate RegExp object from elements with optional flags. @@ -13,8 +13,13 @@ export function buildRegExp(sequence: RegexSequence, flags?: RegexFlags): RegExp const pattern = encode(sequence).pattern; const flagsString = encodeFlags(flags ?? {}); - if (hasUnicodeAwareRegex(pattern) && !flags?.unicode) { - throw new Error('Unicode-aware regex pattern requires "unicode" flag'); + if (!flags?.unicode) { + const unicodePattern = getFirstUnicodeAwarePattern(pattern); + if (unicodePattern) { + throw new Error( + `The pattern "${unicodePattern}" requires Unicode-aware mode. Please ensure the "unicode" flag is set.`, + ); + } } return new RegExp(pattern, flagsString); diff --git a/src/constructs/__tests__/unicode.test.tsx b/src/constructs/__tests__/unicode.test.tsx index b3fffed..64873dd 100644 --- a/src/constructs/__tests__/unicode.test.tsx +++ b/src/constructs/__tests__/unicode.test.tsx @@ -7,7 +7,6 @@ import { unicodeChar, unicodeProperty, } from '../../index'; -import { hasUnicodeAwareRegex } from '../unicode'; function u(sequence: RegexSequence) { return buildRegExp(sequence, { unicode: true }); @@ -81,16 +80,16 @@ test('`unicodeChar` nesting matching', () => { test('`unicodeChar` edge cases handling', () => { expect(() => u(unicodeChar(NaN))).toThrowErrorMatchingInlineSnapshot( - `""unicodeChar": expected valid unicode code point but got: NaN"`, + `""unicodeChar": expected a valid unicode code point but got: NaN"`, ); expect(() => u(unicodeChar(1.5))).toThrowErrorMatchingInlineSnapshot( - `""unicodeChar": expected valid unicode code point but got: 1.5"`, 
+ `""unicodeChar": expected a valid unicode code point but got: 1.5"`, ); expect(() => u(unicodeChar(-1))).toThrowErrorMatchingInlineSnapshot( - `""unicodeChar": expected valid unicode code point but got: -1"`, + `""unicodeChar": expected a valid unicode code point but got: -1"`, ); expect(() => u(unicodeChar(0x110000))).toThrowErrorMatchingInlineSnapshot( - `""unicodeChar": expected valid unicode code point but got: 1114112"`, + `""unicodeChar": expected a valid unicode code point but got: 1114112"`, ); expect(u(unicodeChar(0x10ffff))).toEqualRegex(/\u{10ffff}/u); @@ -159,19 +158,3 @@ test('`unicodeProperty` nesting matching', () => { u(charClass(unicodeProperty('Lowercase'), unicodeProperty('White_Space'))), ).not.toMatchString('A'); }); - -test('has unicode-aware regex', () => { - expect(hasUnicodeAwareRegex(/\p{Emoji_Presentation}/u.source)).toBe(true); - expect(hasUnicodeAwareRegex(/aaa\p{Emoji_Presentation}/u.source)).toBe(true); - expect(hasUnicodeAwareRegex(/\p{Emoji_Presentation}bbb/u.source)).toBe(true); - expect(hasUnicodeAwareRegex(/\u{123}/u.source)).toBe(true); - expect(hasUnicodeAwareRegex(/\u{01234}/u.source)).toBe(true); - expect(hasUnicodeAwareRegex(/aaa\u{01234}/u.source)).toBe(true); - expect(hasUnicodeAwareRegex(/\u{01234}bbb/u.source)).toBe(true); - - expect(hasUnicodeAwareRegex(/\x23/.source)).toBe(false); - expect(hasUnicodeAwareRegex(/\u0123/.source)).toBe(false); - expect(hasUnicodeAwareRegex(/\u1f60/.source)).toBe(false); - expect(hasUnicodeAwareRegex(/a/.source)).toBe(false); - expect(hasUnicodeAwareRegex(/abc/.source)).toBe(false); -}); diff --git a/src/constructs/unicode.ts b/src/constructs/unicode.ts index f0b3cdd..d381e47 100644 --- a/src/constructs/unicode.ts +++ b/src/constructs/unicode.ts @@ -14,7 +14,7 @@ import type { CharacterEscape } from '../types'; */ export function unicodeChar(codePoint: number): CharacterEscape { if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10ffff) { - throw new 
RangeError('"unicodeChar": expected valid unicode code point but got: ' + codePoint); + throw new RangeError(`Expected a valid unicode code point but received ${codePoint}`); } let escape = @@ -49,6 +49,29 @@ export function unicodeProperty(property: string, value?: string): CharacterEsca }; } -export function hasUnicodeAwareRegex(pattern: string): boolean { - return /\\(u|p)\{/.test(pattern); +/** + * Unicode character class escape matching a set of characters not specified by a Unicode property. + * + * Regex pattern: `\P{Property}` or `\P{Property=Value}` + * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Unicode_character_class_escape + * + * @param property Unicode property name. + * @param value Unicode property value (optional). + * @returns A character class representing the complement of the unicode property escape. + */ +export function unicodePropertyComplement(property: string, value?: string): CharacterEscape { + const escape = `\\P{${property}${value ? `=${value}` : ''}}`; + + return { + precedence: 'atom', + pattern: escape, + chars: [escape], + }; +} + +const unicodeModeRegex = /(?:\\u|\\p|\\P)\{.+?\}/; + +export function getFirstUnicodeAwarePattern(pattern: string): string | null { + const match = pattern.match(unicodeModeRegex); + return match?.[0] ?? 
null; } From c05afdd5af0a06b7885a05a8205b956ccca2b5b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Jastrze=CC=A8bski?= Date: Fri, 6 Sep 2024 22:33:01 +0200 Subject: [PATCH 07/13] refactor: naming tweaks --- README.md | 7 +- src/__tests__/builder.test.ts | 19 +-- src/builders.ts | 14 +- .../__tests__/char-escape-unicode.test.tsx | 150 ++++++++++++++++ src/constructs/__tests__/unicode.test.tsx | 160 ------------------ src/constructs/char-class.ts | 53 +++++- src/constructs/char-escape.ts | 51 ++++++ src/constructs/unicode.ts | 77 --------- src/index.ts | 19 ++- src/types.ts | 4 +- website/docs/api/unicode.md | 11 +- 11 files changed, 293 insertions(+), 272 deletions(-) create mode 100644 src/constructs/__tests__/char-escape-unicode.test.tsx delete mode 100644 src/constructs/__tests__/unicode.test.tsx delete mode 100644 src/constructs/unicode.ts diff --git a/README.md b/README.md index d3c0fa7..1b90147 100644 --- a/README.md +++ b/README.md @@ -177,9 +177,12 @@ TS Regex Builder is inspired by [Swift Regex Builder API](https://developer.appl ## Reference -- [ECMAScript Regular Expression BNF Grammar](https://262.ecma-international.org/7.0/#sec-regular-expressions) -- [Swift Regex Builder API docs](https://developer.apple.com/documentation/regexbuilder) +- [ECMAScript Regular Expression BNF Grammar](https://tc39.es/ecma262/#sec-regular-expressions) +- [Unicode Regular Expressions](https://www.unicode.org/reports/tr18/) - [Swift Evolution 351: Regex Builder DSL](https://github.com/apple/swift-evolution/blob/main/proposals/0351-regex-builder.md) +- [Swift Regex Builder API docs](https://developer.apple.com/documentation/regexbuilder) + + --- diff --git a/src/__tests__/builder.test.ts b/src/__tests__/builder.test.ts index 1668bbd..baf9150 100644 --- a/src/__tests__/builder.test.ts +++ b/src/__tests__/builder.test.ts @@ -1,5 +1,4 @@ -import { buildRegExp, unicodeChar } from '..'; -import { unicodeProperty } from '../constructs/unicode'; +import { buildRegExp, char, 
charProperty } from '..'; test('`regexBuilder` flags', () => { expect(buildRegExp('a').flags).toBe(''); @@ -35,19 +34,17 @@ test('`regexBuilder` flags', () => { }); test('`regexBuilder` throws when using unicode-aware features without `unicode` flag', () => { - expect(() => buildRegExp(unicodeChar(0x1234))).not.toThrow(); - expect(() => buildRegExp(unicodeChar(0x12345), { unicode: true })).not.toThrow(); - expect(() => buildRegExp(unicodeProperty('Emoji_Presentation'), { unicode: true })).not.toThrow(); + expect(() => buildRegExp(char(0x1234))).not.toThrow(); + expect(() => buildRegExp(char(0x12345), { unicode: true })).not.toThrow(); + expect(() => buildRegExp(charProperty('Emoji_Presentation'), { unicode: true })).not.toThrow(); - expect(() => buildRegExp(unicodeChar(0x123456))).toThrowErrorMatchingInlineSnapshot( - `""unicodeChar": expected a valid unicode code point but got: 1193046"`, + expect(() => buildRegExp(char(0x123456))).toThrowErrorMatchingInlineSnapshot( + `"Expected a valid unicode code point but received 1193046"`, ); - expect(() => buildRegExp(unicodeChar(0x12345))).toThrowErrorMatchingInlineSnapshot( + expect(() => buildRegExp(char(0x12345))).toThrowErrorMatchingInlineSnapshot( `"The pattern "\\u{12345}" requires Unicode-aware mode. Please ensure the "unicode" flag is set."`, ); - expect(() => - buildRegExp(unicodeProperty('Emoji_Presentation')), - ).toThrowErrorMatchingInlineSnapshot( + expect(() => buildRegExp(charProperty('Emoji_Presentation'))).toThrowErrorMatchingInlineSnapshot( `"The pattern "\\p{Emoji_Presentation}" requires Unicode-aware mode. 
Please ensure the "unicode" flag is set."`, ); expect(() => buildRegExp(/\P{Letter}/u)).toThrowErrorMatchingInlineSnapshot( diff --git a/src/builders.ts b/src/builders.ts index b8e9d7c..482392f 100644 --- a/src/builders.ts +++ b/src/builders.ts @@ -1,6 +1,5 @@ import type { RegexFlags, RegexSequence } from './types'; import { encode } from './encoder'; -import { getFirstUnicodeAwarePattern } from './constructs/unicode'; /** * Generate RegExp object from elements with optional flags. @@ -14,10 +13,10 @@ export function buildRegExp(sequence: RegexSequence, flags?: RegexFlags): RegExp const flagsString = encodeFlags(flags ?? {}); if (!flags?.unicode) { - const unicodePattern = getFirstUnicodeAwarePattern(pattern); - if (unicodePattern) { + const unicodeModePattern = getUnicodeModePattern(pattern); + if (unicodeModePattern) { throw new Error( - `The pattern "${unicodePattern}" requires Unicode-aware mode. Please ensure the "unicode" flag is set.`, + `The pattern "${unicodeModePattern}" requires Unicode-aware mode. Please ensure the "unicode" flag is set.`, ); } } @@ -47,3 +46,10 @@ function encodeFlags(flags: RegexFlags): string { return result; } + +const unicodeModePatterns = /(?:\\u|\\p|\\P)\{.+?\}/; + +function getUnicodeModePattern(pattern: string): string | null { + const match = pattern.match(unicodeModePatterns); + return match?.[0] ?? 
null; +} diff --git a/src/constructs/__tests__/char-escape-unicode.test.tsx b/src/constructs/__tests__/char-escape-unicode.test.tsx new file mode 100644 index 0000000..b79e7cb --- /dev/null +++ b/src/constructs/__tests__/char-escape-unicode.test.tsx @@ -0,0 +1,150 @@ +import { + buildRegExp, + char, + charClass, + charProperty, + endOfString, + type RegexSequence, + startOfString, +} from '../..'; + +function u(sequence: RegexSequence) { + return buildRegExp(sequence, { unicode: true }); +} + +test('`char` pattern', () => { + // eslint-disable-next-line no-control-regex + expect(char(0)).toEqualRegex(/\u0000/); + // eslint-disable-next-line no-control-regex + expect(char(0x1)).toEqualRegex(/\u0001/); + // eslint-disable-next-line no-control-regex + expect(char(0x12)).toEqualRegex(/\u0012/); + expect(char(0x123)).toEqualRegex(/\u0123/); + expect(char(0x1234)).toEqualRegex(/\u1234/); + + // eslint-disable-next-line no-control-regex + expect(u(char(0))).toEqualRegex(new RegExp('\\u0000', 'u')); + // eslint-disable-next-line no-control-regex + expect(u(char(0x1))).toEqualRegex(new RegExp('\\u0001', 'u')); + expect(u(char(0x12))).toEqualRegex( + // eslint-disable-next-line no-control-regex + new RegExp('\\u0012', 'u'), + ); + expect(char(0x0123)).toEqualRegex(/\u0123/); + expect(char(0x1234)).toEqualRegex(/\u1234/); + + expect(u(char(0x0123))).toEqualRegex(/\u0123/u); + expect(u(char(0x1234))).toEqualRegex(/\u1234/u); + expect(u(char(0x12345))).toEqualRegex(new RegExp('\\u{12345}', 'u')); + expect(u(char(0x103456))).toEqualRegex(new RegExp('\\u{103456}', 'u')); +}); + +test('`char` matching', () => { + expect(char(0)).toMatchString('\u{0}'); + expect(char(0x1)).toMatchString('\u{1}'); + expect(char(0x12)).toMatchString('\u{12}}'); + expect(char(0x123)).toMatchString('\u{123}'); + expect(char(0x1234)).toMatchString('\u{1234}}'); + + expect(char('a'.codePointAt(0)!)).toMatchString('a'); + expect(char('ą'.codePointAt(0)!)).toMatchString('ą'); + 
expect(char('©'.codePointAt(0)!)).toMatchString('©'); + + expect(u(char(0))).toMatchString('\u{0}'); + expect(u(char(0))).not.toMatchString('a'); + expect(u(char(0x1))).toMatchString('\u{1}'); + expect(u(char(0x12))).toMatchString('\u{12}'); + expect(u(char(0x123))).toMatchString('\u{123}'); + expect(u(char(0x1234))).toMatchString('\u{1234}'); + expect(u(char(0x12345))).toMatchString('\u{12345}'); + expect(u(char(0x103456))).toMatchString('\u{103456}'); + + expect(u(char('a'.codePointAt(0)!))).toMatchString('a'); + expect(u(char('ą'.codePointAt(0)!))).toMatchString('ą'); + expect(u(char('©'.codePointAt(0)!))).toMatchString('©'); + expect(u(char('😎'.codePointAt(0)!))).toMatchString('😎'); + expect(u(char('😎'.codePointAt(0)!))).toMatchString('\u{1f60e}'); +}); + +test('`char` nesting matching', () => { + expect(u(charClass(char('a'.codePointAt(0)!), char('ą'.codePointAt(0)!)))).toMatchString('a'); + expect(u(charClass(char('a'.codePointAt(0)!), char('ą'.codePointAt(0)!)))).toMatchString('ą'); + expect(u(charClass(char('a'.codePointAt(0)!), char('ą'.codePointAt(0)!)))).not.toMatchString('b'); +}); + +test('`char` edge cases handling', () => { + expect(() => u(char(NaN))).toThrowErrorMatchingInlineSnapshot( + `"Expected a valid unicode code point but received NaN"`, + ); + expect(() => u(char(1.5))).toThrowErrorMatchingInlineSnapshot( + `"Expected a valid unicode code point but received 1.5"`, + ); + expect(() => u(char(-1))).toThrowErrorMatchingInlineSnapshot( + `"Expected a valid unicode code point but received -1"`, + ); + expect(() => u(char(0x110000))).toThrowErrorMatchingInlineSnapshot( + `"Expected a valid unicode code point but received 1114112"`, + ); + + expect(u(char(0x10ffff))).toEqualRegex(/\u{10ffff}/u); +}); + +test('`charProperty` pattern', () => { + expect(u(charProperty('General_Category', 'Letter'))).toEqualRegex( + /\p{General_Category=Letter}/u, + ); + expect(u(charProperty('Letter'))).toEqualRegex(/\p{Letter}/u); + 
expect(u(charProperty('L'))).toEqualRegex(/\p{L}/u); + expect(u(charProperty('Lu'))).toEqualRegex(/\p{Lu}/u); + expect(u(charProperty('Ll'))).toEqualRegex(/\p{Ll}/u); + expect(u(charProperty('Lt'))).toEqualRegex(/\p{Lt}/u); + expect(u(charProperty('Lm'))).toEqualRegex(/\p{Lm}/u); + expect(u(charProperty('Lo'))).toEqualRegex(/\p{Lo}/u); + + expect(u(charProperty('Script', 'Latin'))).toEqualRegex('\\p{Script=Latin}'); + expect(u(charProperty('Script', 'Grek'))).toEqualRegex('\\p{Script=Grek}'); + expect(u(charProperty('sc', 'Cyrillic'))).toEqualRegex('\\p{sc=Cyrillic}'); + + expect(u(charProperty('Script', 'Thaana'))).toEqualRegex('\\p{Script=Thaana}'); + expect(u(charProperty('Script_Extensions', 'Thaana'))).toEqualRegex( + '\\p{Script_Extensions=Thaana}', + ); + expect(u(charProperty('scx', 'Thaana'))).toEqualRegex('\\p{scx=Thaana}'); + + expect(u(charProperty('Emoji'))).toEqualRegex('\\p{Emoji}'); +}); + +test('`charProperty` matching', () => { + expect(u(charProperty('General_Category', 'Letter'))).toMatchString('A'); + expect(u(charProperty('Letter'))).toMatchString('A'); + expect(u(charProperty('L'))).toMatchString('A'); + + expect(u(charProperty('Uppercase'))).toMatchString('A'); + expect(u(charProperty('Uppercase'))).not.toMatchString('a'); + expect(u(charProperty('Lu'))).toMatchString('A'); + + expect(u(charProperty('Lowercase'))).toMatchString('a'); + expect(u(charProperty('Lowercase'))).not.toMatchString('A'); + expect(u(charProperty('Ll'))).toMatchString('a'); + + expect(u(charProperty('Script', 'Latin'))).toMatchString('A'); + expect(u(charProperty('Script', 'Latin'))).not.toMatchString('α'); + expect(u(charProperty('Script', 'Grek'))).toMatchString('α'); + expect(u(charProperty('Script', 'Grek'))).not.toMatchString('A'); + + // Basic emoji + expect(u([startOfString, charProperty('Emoji'), endOfString])).toMatchString('😎'); + expect(u([startOfString, charProperty('Emoji'), endOfString])).toMatchString('🐌'); + + // Complex emoji with skin tone modifier + 
expect(u(charProperty('Emoji'))).toMatchString('☝🏼'); + expect(u([startOfString, charProperty('Emoji'), endOfString])).not.toMatchString('☝🏼'); +}); + +test('`charProperty` nesting matching', () => { + expect(u(charClass(charProperty('Lowercase'), charProperty('White_Space')))).toMatchString('a'); + expect(u(charClass(charProperty('Lowercase'), charProperty('White_Space')))).toMatchString(' '); + expect(u(charClass(charProperty('Lowercase'), charProperty('White_Space')))).not.toMatchString( + 'A', + ); +}); diff --git a/src/constructs/__tests__/unicode.test.tsx b/src/constructs/__tests__/unicode.test.tsx deleted file mode 100644 index 64873dd..0000000 --- a/src/constructs/__tests__/unicode.test.tsx +++ /dev/null @@ -1,160 +0,0 @@ -import { - buildRegExp, - charClass, - endOfString, - type RegexSequence, - startOfString, - unicodeChar, - unicodeProperty, -} from '../../index'; - -function u(sequence: RegexSequence) { - return buildRegExp(sequence, { unicode: true }); -} - -test('`unicodeChar` pattern', () => { - // eslint-disable-next-line no-control-regex - expect(unicodeChar(0)).toEqualRegex(/\u0000/); - // eslint-disable-next-line no-control-regex - expect(unicodeChar(0x1)).toEqualRegex(/\u0001/); - // eslint-disable-next-line no-control-regex - expect(unicodeChar(0x12)).toEqualRegex(/\u0012/); - expect(unicodeChar(0x123)).toEqualRegex(/\u0123/); - expect(unicodeChar(0x1234)).toEqualRegex(/\u1234/); - - // eslint-disable-next-line no-control-regex - expect(u(unicodeChar(0))).toEqualRegex(new RegExp('\\u0000', 'u')); - // eslint-disable-next-line no-control-regex - expect(u(unicodeChar(0x1))).toEqualRegex(new RegExp('\\u0001', 'u')); - expect(u(unicodeChar(0x12))).toEqualRegex( - // eslint-disable-next-line no-control-regex - new RegExp('\\u0012', 'u'), - ); - expect(unicodeChar(0x0123)).toEqualRegex(/\u0123/); - expect(unicodeChar(0x1234)).toEqualRegex(/\u1234/); - - expect(u(unicodeChar(0x0123))).toEqualRegex(/\u0123/u); - 
expect(u(unicodeChar(0x1234))).toEqualRegex(/\u1234/u); - expect(u(unicodeChar(0x12345))).toEqualRegex(new RegExp('\\u{12345}', 'u')); - expect(u(unicodeChar(0x103456))).toEqualRegex(new RegExp('\\u{103456}', 'u')); -}); - -test('`unicodeChar` matching', () => { - expect(unicodeChar(0)).toMatchString('\u{0}'); - expect(unicodeChar(0x1)).toMatchString('\u{1}'); - expect(unicodeChar(0x12)).toMatchString('\u{12}}'); - expect(unicodeChar(0x123)).toMatchString('\u{123}'); - expect(unicodeChar(0x1234)).toMatchString('\u{1234}}'); - - expect(unicodeChar('a'.codePointAt(0)!)).toMatchString('a'); - expect(unicodeChar('ą'.codePointAt(0)!)).toMatchString('ą'); - expect(unicodeChar('©'.codePointAt(0)!)).toMatchString('©'); - - expect(u(unicodeChar(0))).toMatchString('\u{0}'); - expect(u(unicodeChar(0))).not.toMatchString('a'); - expect(u(unicodeChar(0x1))).toMatchString('\u{1}'); - expect(u(unicodeChar(0x12))).toMatchString('\u{12}'); - expect(u(unicodeChar(0x123))).toMatchString('\u{123}'); - expect(u(unicodeChar(0x1234))).toMatchString('\u{1234}'); - expect(u(unicodeChar(0x12345))).toMatchString('\u{12345}'); - expect(u(unicodeChar(0x103456))).toMatchString('\u{103456}'); - - expect(u(unicodeChar('a'.codePointAt(0)!))).toMatchString('a'); - expect(u(unicodeChar('ą'.codePointAt(0)!))).toMatchString('ą'); - expect(u(unicodeChar('©'.codePointAt(0)!))).toMatchString('©'); - expect(u(unicodeChar('😎'.codePointAt(0)!))).toMatchString('😎'); - expect(u(unicodeChar('😎'.codePointAt(0)!))).toMatchString('\u{1f60e}'); -}); - -test('`unicodeChar` nesting matching', () => { - expect( - u(charClass(unicodeChar('a'.codePointAt(0)!), unicodeChar('ą'.codePointAt(0)!))), - ).toMatchString('a'); - expect( - u(charClass(unicodeChar('a'.codePointAt(0)!), unicodeChar('ą'.codePointAt(0)!))), - ).toMatchString('ą'); - expect( - u(charClass(unicodeChar('a'.codePointAt(0)!), unicodeChar('ą'.codePointAt(0)!))), - ).not.toMatchString('b'); -}); - -test('`unicodeChar` edge cases handling', () => { - 
expect(() => u(unicodeChar(NaN))).toThrowErrorMatchingInlineSnapshot( - `""unicodeChar": expected a valid unicode code point but got: NaN"`, - ); - expect(() => u(unicodeChar(1.5))).toThrowErrorMatchingInlineSnapshot( - `""unicodeChar": expected a valid unicode code point but got: 1.5"`, - ); - expect(() => u(unicodeChar(-1))).toThrowErrorMatchingInlineSnapshot( - `""unicodeChar": expected a valid unicode code point but got: -1"`, - ); - expect(() => u(unicodeChar(0x110000))).toThrowErrorMatchingInlineSnapshot( - `""unicodeChar": expected a valid unicode code point but got: 1114112"`, - ); - - expect(u(unicodeChar(0x10ffff))).toEqualRegex(/\u{10ffff}/u); -}); - -test('`unicodeProperty` pattern', () => { - expect(u(unicodeProperty('General_Category', 'Letter'))).toEqualRegex( - /\p{General_Category=Letter}/u, - ); - expect(u(unicodeProperty('Letter'))).toEqualRegex(/\p{Letter}/u); - expect(u(unicodeProperty('L'))).toEqualRegex(/\p{L}/u); - expect(u(unicodeProperty('Lu'))).toEqualRegex(/\p{Lu}/u); - expect(u(unicodeProperty('Ll'))).toEqualRegex(/\p{Ll}/u); - expect(u(unicodeProperty('Lt'))).toEqualRegex(/\p{Lt}/u); - expect(u(unicodeProperty('Lm'))).toEqualRegex(/\p{Lm}/u); - expect(u(unicodeProperty('Lo'))).toEqualRegex(/\p{Lo}/u); - - expect(u(unicodeProperty('Script', 'Latin'))).toEqualRegex('\\p{Script=Latin}'); - expect(u(unicodeProperty('Script', 'Grek'))).toEqualRegex('\\p{Script=Grek}'); - expect(u(unicodeProperty('sc', 'Cyrillic'))).toEqualRegex('\\p{sc=Cyrillic}'); - - expect(u(unicodeProperty('Script', 'Thaana'))).toEqualRegex('\\p{Script=Thaana}'); - expect(u(unicodeProperty('Script_Extensions', 'Thaana'))).toEqualRegex( - '\\p{Script_Extensions=Thaana}', - ); - expect(u(unicodeProperty('scx', 'Thaana'))).toEqualRegex('\\p{scx=Thaana}'); - - expect(u(unicodeProperty('Emoji'))).toEqualRegex('\\p{Emoji}'); -}); - -test('`unicodeProperty` matching', () => { - expect(u(unicodeProperty('General_Category', 'Letter'))).toMatchString('A'); - 
expect(u(unicodeProperty('Letter'))).toMatchString('A'); - expect(u(unicodeProperty('L'))).toMatchString('A'); - - expect(u(unicodeProperty('Uppercase'))).toMatchString('A'); - expect(u(unicodeProperty('Uppercase'))).not.toMatchString('a'); - expect(u(unicodeProperty('Lu'))).toMatchString('A'); - - expect(u(unicodeProperty('Lowercase'))).toMatchString('a'); - expect(u(unicodeProperty('Lowercase'))).not.toMatchString('A'); - expect(u(unicodeProperty('Ll'))).toMatchString('a'); - - expect(u(unicodeProperty('Script', 'Latin'))).toMatchString('A'); - expect(u(unicodeProperty('Script', 'Latin'))).not.toMatchString('α'); - expect(u(unicodeProperty('Script', 'Grek'))).toMatchString('α'); - expect(u(unicodeProperty('Script', 'Grek'))).not.toMatchString('A'); - - // Basic emoji - expect(u([startOfString, unicodeProperty('Emoji'), endOfString])).toMatchString('😎'); - expect(u([startOfString, unicodeProperty('Emoji'), endOfString])).toMatchString('🐌'); - - // Complex emoji with skin tone modifier - expect(u(unicodeProperty('Emoji'))).toMatchString('☝🏼'); - expect(u([startOfString, unicodeProperty('Emoji'), endOfString])).not.toMatchString('☝🏼'); -}); - -test('`unicodeProperty` nesting matching', () => { - expect(u(charClass(unicodeProperty('Lowercase'), unicodeProperty('White_Space')))).toMatchString( - 'a', - ); - expect(u(charClass(unicodeProperty('Lowercase'), unicodeProperty('White_Space')))).toMatchString( - ' ', - ); - expect( - u(charClass(unicodeProperty('Lowercase'), unicodeProperty('White_Space'))), - ).not.toMatchString('A'); -}); diff --git a/src/constructs/char-class.ts b/src/constructs/char-class.ts index b2bc758..0da9375 100644 --- a/src/constructs/char-class.ts +++ b/src/constructs/char-class.ts @@ -32,7 +32,7 @@ export function charRange(start: string, end: string): CharacterClass { } export function anyOf(characters: string): CharacterClass { - const chars = characters.split('').map((c) => escapeForCharacterClass(c)); + const chars = 
characters.split('').map((c) => escapeCharClass(c)); if (chars.length === 0) { throw new Error('`anyOf` should received at least one character'); @@ -52,6 +52,55 @@ export function negated(element: CharacterClass | CharacterEscape): EncodedRegex */ export const inverted = negated; -function escapeForCharacterClass(text: string): string { +function escapeCharClass(text: string): string { return text.replace(/[\]\\]/g, '\\$&'); // $& means the whole matched string } + +/** + * Unicode character escape. + * + * Regex pattern: + * - `\uXXXX`: 4-digit hex escape for code points below 0x10000. + * - `\u{X}`: Unicode code point escape for code points above 0xFFFF. + * + * Note: for code points above 0xFFFF, the regex engine must be [unicode-aware](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode). + * + * @param codePoint The code point of the character to escape. + * @returns A character class representing the unicode escape. + */ +export function charCode(codePoint: number): CharacterEscape { + if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10ffff) { + throw new RangeError(`Expected a valid unicode code point but received ${codePoint}`); + } + + let escape = + codePoint < 0x10000 + ? `\\u${codePoint.toString(16).padStart(4, '0')}` // 4-digit hex (works in all modes) + : `\\u{${codePoint.toString(16)}}`; // 1-6 digit hex (requires unicode-aware mode) + + return { + precedence: 'atom', + pattern: escape, + chars: [escape], + }; +} + +/** + * Unicode character class escape matching a set of characters specified by a Unicode property. + * + * Regex pattern: `\p{Property}` or `\p{Property=Value}` + * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Unicode_character_class_escape + * + * @param property Unicode property name. + * @param value Unicode property value (optional). + * @returns A character class representing the unicode property escape. 
+ */ +export function charProperty(property: string, value?: string): CharacterEscape { + const escape = `\\p{${property}${value ? `=${value}` : ''}}`; + + return { + precedence: 'atom', + pattern: escape, + chars: [escape], + }; +} diff --git a/src/constructs/char-escape.ts b/src/constructs/char-escape.ts index 77aa2cb..a9dea32 100644 --- a/src/constructs/char-escape.ts +++ b/src/constructs/char-escape.ts @@ -59,3 +59,54 @@ export const notWord = nonWord; * @deprecated Renamed to `nonWhitespace`. */ export const notWhitespace = nonWhitespace; + +/** + * Unicode character code point escape. + * + * Regex pattern: + * - `\uXXXX`: 4-digit hex escape for code points below 0x10000. + * - `\u{X}`: Unicode code point escape for code points above 0xFFFF. + * + * Note: for code points above 0xFFFF, the regex must be [unicode-aware](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode). + * + * @param codePoint The code point of the character to escape. + * @returns A character class representing the unicode escape. + */ +export function char(codePoint: number): CharacterEscape { + if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10ffff) { + throw new RangeError(`Expected a valid unicode code point but received ${codePoint}`); + } + + let escape = + codePoint < 0x10000 + ? `\\u${codePoint.toString(16).padStart(4, '0')}` // 4-digit hex (works in all modes) + : `\\u{${codePoint.toString(16)}}`; // 1-6 digit hex (requires unicode-aware mode) + + return { + precedence: 'atom', + pattern: escape, + chars: [escape], + }; +} + +/** + * Unicode character property name escape matching a set of characters specified by a Unicode property. 
+ * + * Regex pattern: `\p{Property}` or `\p{Property=Value}` + * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Unicode_character_class_escape + * + * Note: the regex must be [unicode-aware](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode). + * + * @param property Unicode property name. + * @param value Unicode property value (optional). + * @returns A character class representing the unicode property escape. + */ +export function charProperty(property: string, value?: string): CharacterEscape { + const escape = `\\p{${property}${value ? `=${value}` : ''}}`; + + return { + precedence: 'atom', + pattern: escape, + chars: [escape], + }; +} diff --git a/src/constructs/unicode.ts b/src/constructs/unicode.ts deleted file mode 100644 index d381e47..0000000 --- a/src/constructs/unicode.ts +++ /dev/null @@ -1,77 +0,0 @@ -import type { CharacterEscape } from '../types'; - -/** - * Unicode character escape. - * - * Regex pattern: - * - `\uXXXX`: 4-digit hex escape for code points below 0x10000. - * - `\u{X}`: Unicode code point escape for code points above 0xFFFF. - * - * Note: for code points above 0xFFFF, the regex engine must be [unicode-aware](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode). - * - * @param codePoint The code point of the character to escape. - * @returns A character class representing the unicode escape. - */ -export function unicodeChar(codePoint: number): CharacterEscape { - if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10ffff) { - throw new RangeError(`Expected a valid unicode code point but received ${codePoint}`); - } - - let escape = - codePoint < 0x10000 - ? 
`\\u${codePoint.toString(16).padStart(4, '0')}` // 4-digit hex (works in all modes) - : `\\u{${codePoint.toString(16)}}`; // 1-6 digit hex (requires unicode-aware mode) - - return { - precedence: 'atom', - pattern: escape, - chars: [escape], - }; -} - -/** - * Unicode character class escape matching a set of characters specified by a Unicode property. - * - * Regex pattern: `\p{Property}` or `\p{Property=Value}` - * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Unicode_character_class_escape - * - * @param property Unicode property name. - * @param value Unicode property value (optional). - * @returns A character class representing the unicode property escape. - */ -export function unicodeProperty(property: string, value?: string): CharacterEscape { - const escape = `\\p{${property}${value ? `=${value}` : ''}}`; - - return { - precedence: 'atom', - pattern: escape, - chars: [escape], - }; -} - -/** - * Unicode character class escape matching a set of characters not specified by a Unicode property. - * - * Regex pattern: `\P{Property}` or `\P{Property=Value}` - * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Unicode_character_class_escape - * - * @param property Unicode property name. - * @param value Unicode property value (optional). - * @returns A character class representing the complement of the unicode property escape. - */ -export function unicodePropertyComplement(property: string, value?: string): CharacterEscape { - const escape = `\\P{${property}${value ? `=${value}` : ''}}`; - - return { - precedence: 'atom', - pattern: escape, - chars: [escape], - }; -} - -const unicodeModeRegex = /(?:\\u|\\p|\\P)\{.+?\}/; - -export function getFirstUnicodeAwarePattern(pattern: string): string | null { - const match = pattern.match(unicodeModeRegex); - return match?.[0] ?? 
null; -} diff --git a/src/index.ts b/src/index.ts index 4ef5c5c..623779f 100644 --- a/src/index.ts +++ b/src/index.ts @@ -5,36 +5,37 @@ export type { QuantifierOptions } from './constructs/quantifiers'; export type { RepeatOptions } from './constructs/repeat'; // Builders -export { buildPattern, buildRegExp } from './builders'; +export { buildRegExp, buildPattern } from './builders'; // Constructs export { + startOfString, endOfString, + wordBoundary, nonWordBoundary, notWordBoundary, - startOfString, - wordBoundary, } from './constructs/anchors'; export { capture, ref } from './constructs/capture'; -export { anyOf, charClass, charRange, negated, inverted } from './constructs/char-class'; +export { charClass, charRange, anyOf, negated, inverted } from './constructs/char-class'; export { any, digit, nonDigit, - nonWhitespace, + word, nonWord, + whitespace, + nonWhitespace, notDigit, notWhitespace, notWord, - whitespace, - word, + char, + charProperty, } from './constructs/char-escape'; export { choiceOf } from './constructs/choice-of'; export { lookahead } from './constructs/lookahead'; export { lookbehind } from './constructs/lookbehind'; export { negativeLookahead } from './constructs/negative-lookahead'; export { negativeLookbehind } from './constructs/negative-lookbehind'; -export { oneOrMore, optional, zeroOrMore } from './constructs/quantifiers'; +export { zeroOrMore, oneOrMore, optional } from './constructs/quantifiers'; export { regex } from './constructs/regex'; export { repeat } from './constructs/repeat'; -export { unicodeChar, unicodeProperty } from './constructs/unicode'; diff --git a/src/types.ts b/src/types.ts index 5436499..81e23a3 100644 --- a/src/types.ts +++ b/src/types.ts @@ -88,8 +88,8 @@ export interface RegexFlags { * Enables [Unicode-aware mode](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode). 
* * This enables features like: - * - unicode character escapes: `\u{xxxx}` - * - unicode character class escapes:`\p{UnicodePropertyValue}` + * - Unicode character escapes: `\u{xxxx}` + * - Unicode character property escapes:`\p{Property=Value}` * * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode */ diff --git a/website/docs/api/unicode.md b/website/docs/api/unicode.md index b1f3386..78a8804 100644 --- a/website/docs/api/unicode.md +++ b/website/docs/api/unicode.md @@ -7,10 +7,10 @@ title: Unicode JavaScript `RegExp` object offers [Unicode-aware](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode). -### Unicode character escapes +### Character escapes ```ts -function unicodeChar(): CharacterEscape; +function charCode(codePoint: number): CharacterEscape; ``` Regex syntax: @@ -20,14 +20,15 @@ Regex syntax: Note: for code points above 0xFFFF, the regex engine must be [unicode-aware](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode). -### Unicode character class escapes +### Unicode character property escapes ```ts -function unicodeProperty(property: string, value?: string): CharacterEscape; +function charProperty(property: string, value?: string): CharacterEscape; ``` -Unicode character class escape matching a set of characters specified by a Unicode property. +Unicode character property escape matching a set of characters specified by a Unicode property. 
Regex syntax: `\p{Property}` or `\p{Property=Value}` @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Unicode_character_class_escape +@see https://www.unicode.org/reports/tr18/ From 8a389c69e9caa36edb6caa012acb91a6e129ae1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Jastrze=CC=A8bski?= Date: Fri, 6 Sep 2024 22:35:49 +0200 Subject: [PATCH 08/13] refactor: tweaks --- src/__tests__/builder.test.ts | 8 +- .../__tests__/char-escape-unicode.test.tsx | 88 ++++++++++--------- src/constructs/char-escape.ts | 4 +- src/index.ts | 2 +- 4 files changed, 54 insertions(+), 48 deletions(-) diff --git a/src/__tests__/builder.test.ts b/src/__tests__/builder.test.ts index baf9150..7bb2a6d 100644 --- a/src/__tests__/builder.test.ts +++ b/src/__tests__/builder.test.ts @@ -1,4 +1,4 @@ -import { buildRegExp, char, charProperty } from '..'; +import { buildRegExp, char, unicodeProperty } from '..'; test('`regexBuilder` flags', () => { expect(buildRegExp('a').flags).toBe(''); @@ -36,7 +36,7 @@ test('`regexBuilder` flags', () => { test('`regexBuilder` throws when using unicode-aware features without `unicode` flag', () => { expect(() => buildRegExp(char(0x1234))).not.toThrow(); expect(() => buildRegExp(char(0x12345), { unicode: true })).not.toThrow(); - expect(() => buildRegExp(charProperty('Emoji_Presentation'), { unicode: true })).not.toThrow(); + expect(() => buildRegExp(unicodeProperty('Emoji_Presentation'), { unicode: true })).not.toThrow(); expect(() => buildRegExp(char(0x123456))).toThrowErrorMatchingInlineSnapshot( `"Expected a valid unicode code point but received 1193046"`, @@ -44,7 +44,9 @@ test('`regexBuilder` throws when using unicode-aware features without `unicode` expect(() => buildRegExp(char(0x12345))).toThrowErrorMatchingInlineSnapshot( `"The pattern "\\u{12345}" requires Unicode-aware mode. 
Please ensure the "unicode" flag is set."`, ); - expect(() => buildRegExp(charProperty('Emoji_Presentation'))).toThrowErrorMatchingInlineSnapshot( + expect(() => + buildRegExp(unicodeProperty('Emoji_Presentation')), + ).toThrowErrorMatchingInlineSnapshot( `"The pattern "\\p{Emoji_Presentation}" requires Unicode-aware mode. Please ensure the "unicode" flag is set."`, ); expect(() => buildRegExp(/\P{Letter}/u)).toThrowErrorMatchingInlineSnapshot( diff --git a/src/constructs/__tests__/char-escape-unicode.test.tsx b/src/constructs/__tests__/char-escape-unicode.test.tsx index b79e7cb..e7c940e 100644 --- a/src/constructs/__tests__/char-escape-unicode.test.tsx +++ b/src/constructs/__tests__/char-escape-unicode.test.tsx @@ -2,10 +2,10 @@ import { buildRegExp, char, charClass, - charProperty, endOfString, type RegexSequence, startOfString, + unicodeProperty, } from '../..'; function u(sequence: RegexSequence) { @@ -89,62 +89,66 @@ test('`char` edge cases handling', () => { expect(u(char(0x10ffff))).toEqualRegex(/\u{10ffff}/u); }); -test('`charProperty` pattern', () => { - expect(u(charProperty('General_Category', 'Letter'))).toEqualRegex( +test('`unicodeProperty` pattern', () => { + expect(u(unicodeProperty('General_Category', 'Letter'))).toEqualRegex( /\p{General_Category=Letter}/u, ); - expect(u(charProperty('Letter'))).toEqualRegex(/\p{Letter}/u); - expect(u(charProperty('L'))).toEqualRegex(/\p{L}/u); - expect(u(charProperty('Lu'))).toEqualRegex(/\p{Lu}/u); - expect(u(charProperty('Ll'))).toEqualRegex(/\p{Ll}/u); - expect(u(charProperty('Lt'))).toEqualRegex(/\p{Lt}/u); - expect(u(charProperty('Lm'))).toEqualRegex(/\p{Lm}/u); - expect(u(charProperty('Lo'))).toEqualRegex(/\p{Lo}/u); - - expect(u(charProperty('Script', 'Latin'))).toEqualRegex('\\p{Script=Latin}'); - expect(u(charProperty('Script', 'Grek'))).toEqualRegex('\\p{Script=Grek}'); - expect(u(charProperty('sc', 'Cyrillic'))).toEqualRegex('\\p{sc=Cyrillic}'); - - expect(u(charProperty('Script', 
'Thaana'))).toEqualRegex('\\p{Script=Thaana}'); - expect(u(charProperty('Script_Extensions', 'Thaana'))).toEqualRegex( + expect(u(unicodeProperty('Letter'))).toEqualRegex(/\p{Letter}/u); + expect(u(unicodeProperty('L'))).toEqualRegex(/\p{L}/u); + expect(u(unicodeProperty('Lu'))).toEqualRegex(/\p{Lu}/u); + expect(u(unicodeProperty('Ll'))).toEqualRegex(/\p{Ll}/u); + expect(u(unicodeProperty('Lt'))).toEqualRegex(/\p{Lt}/u); + expect(u(unicodeProperty('Lm'))).toEqualRegex(/\p{Lm}/u); + expect(u(unicodeProperty('Lo'))).toEqualRegex(/\p{Lo}/u); + + expect(u(unicodeProperty('Script', 'Latin'))).toEqualRegex('\\p{Script=Latin}'); + expect(u(unicodeProperty('Script', 'Grek'))).toEqualRegex('\\p{Script=Grek}'); + expect(u(unicodeProperty('sc', 'Cyrillic'))).toEqualRegex('\\p{sc=Cyrillic}'); + + expect(u(unicodeProperty('Script', 'Thaana'))).toEqualRegex('\\p{Script=Thaana}'); + expect(u(unicodeProperty('Script_Extensions', 'Thaana'))).toEqualRegex( '\\p{Script_Extensions=Thaana}', ); - expect(u(charProperty('scx', 'Thaana'))).toEqualRegex('\\p{scx=Thaana}'); + expect(u(unicodeProperty('scx', 'Thaana'))).toEqualRegex('\\p{scx=Thaana}'); - expect(u(charProperty('Emoji'))).toEqualRegex('\\p{Emoji}'); + expect(u(unicodeProperty('Emoji'))).toEqualRegex('\\p{Emoji}'); }); -test('`charProperty` matching', () => { - expect(u(charProperty('General_Category', 'Letter'))).toMatchString('A'); - expect(u(charProperty('Letter'))).toMatchString('A'); - expect(u(charProperty('L'))).toMatchString('A'); +test('`unicodeProperty` matching', () => { + expect(u(unicodeProperty('General_Category', 'Letter'))).toMatchString('A'); + expect(u(unicodeProperty('Letter'))).toMatchString('A'); + expect(u(unicodeProperty('L'))).toMatchString('A'); - expect(u(charProperty('Uppercase'))).toMatchString('A'); - expect(u(charProperty('Uppercase'))).not.toMatchString('a'); - expect(u(charProperty('Lu'))).toMatchString('A'); + expect(u(unicodeProperty('Uppercase'))).toMatchString('A'); + 
expect(u(unicodeProperty('Uppercase'))).not.toMatchString('a'); + expect(u(unicodeProperty('Lu'))).toMatchString('A'); - expect(u(charProperty('Lowercase'))).toMatchString('a'); - expect(u(charProperty('Lowercase'))).not.toMatchString('A'); - expect(u(charProperty('Ll'))).toMatchString('a'); + expect(u(unicodeProperty('Lowercase'))).toMatchString('a'); + expect(u(unicodeProperty('Lowercase'))).not.toMatchString('A'); + expect(u(unicodeProperty('Ll'))).toMatchString('a'); - expect(u(charProperty('Script', 'Latin'))).toMatchString('A'); - expect(u(charProperty('Script', 'Latin'))).not.toMatchString('α'); - expect(u(charProperty('Script', 'Grek'))).toMatchString('α'); - expect(u(charProperty('Script', 'Grek'))).not.toMatchString('A'); + expect(u(unicodeProperty('Script', 'Latin'))).toMatchString('A'); + expect(u(unicodeProperty('Script', 'Latin'))).not.toMatchString('α'); + expect(u(unicodeProperty('Script', 'Grek'))).toMatchString('α'); + expect(u(unicodeProperty('Script', 'Grek'))).not.toMatchString('A'); // Basic emoji - expect(u([startOfString, charProperty('Emoji'), endOfString])).toMatchString('😎'); - expect(u([startOfString, charProperty('Emoji'), endOfString])).toMatchString('🐌'); + expect(u([startOfString, unicodeProperty('Emoji'), endOfString])).toMatchString('😎'); + expect(u([startOfString, unicodeProperty('Emoji'), endOfString])).toMatchString('🐌'); // Complex emoji with skin tone modifier - expect(u(charProperty('Emoji'))).toMatchString('☝🏼'); - expect(u([startOfString, charProperty('Emoji'), endOfString])).not.toMatchString('☝🏼'); + expect(u(unicodeProperty('Emoji'))).toMatchString('☝🏼'); + expect(u([startOfString, unicodeProperty('Emoji'), endOfString])).not.toMatchString('☝🏼'); }); -test('`charProperty` nesting matching', () => { - expect(u(charClass(charProperty('Lowercase'), charProperty('White_Space')))).toMatchString('a'); - expect(u(charClass(charProperty('Lowercase'), charProperty('White_Space')))).toMatchString(' '); - 
expect(u(charClass(charProperty('Lowercase'), charProperty('White_Space')))).not.toMatchString( - 'A', +test('`unicodeProperty` nesting matching', () => { + expect(u(charClass(unicodeProperty('Lowercase'), unicodeProperty('White_Space')))).toMatchString( + 'a', ); + expect(u(charClass(unicodeProperty('Lowercase'), unicodeProperty('White_Space')))).toMatchString( + ' ', + ); + expect( + u(charClass(unicodeProperty('Lowercase'), unicodeProperty('White_Space'))), + ).not.toMatchString('A'); }); diff --git a/src/constructs/char-escape.ts b/src/constructs/char-escape.ts index a9dea32..fcf6be5 100644 --- a/src/constructs/char-escape.ts +++ b/src/constructs/char-escape.ts @@ -90,7 +90,7 @@ export function char(codePoint: number): CharacterEscape { } /** - * Unicode character property name escape matching a set of characters specified by a Unicode property. + * Unicode property escape matching a set of characters specified by a Unicode property. * * Regex pattern: `\p{Property}` or `\p{Property=Value}` * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Unicode_character_class_escape @@ -101,7 +101,7 @@ export function char(codePoint: number): CharacterEscape { * @param value Unicode property value (optional). * @returns A character class representing the unicode property escape. */ -export function charProperty(property: string, value?: string): CharacterEscape { +export function unicodeProperty(property: string, value?: string): CharacterEscape { const escape = `\\p{${property}${value ? 
`=${value}` : ''}}`; return { diff --git a/src/index.ts b/src/index.ts index 623779f..30d6677 100644 --- a/src/index.ts +++ b/src/index.ts @@ -29,7 +29,7 @@ export { notWhitespace, notWord, char, - charProperty, + unicodeProperty, } from './constructs/char-escape'; export { choiceOf } from './constructs/choice-of'; export { lookahead } from './constructs/lookahead'; From 67ea4bb85e514adc9413b169ada614cbcc033653 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Jastrze=CC=A8bski?= Date: Fri, 6 Sep 2024 22:38:55 +0200 Subject: [PATCH 09/13] chore: tweaks --- README.md | 27 +++++++++++++++------------ website/docs/api/unicode.md | 2 +- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 1b90147..b0ba843 100644 --- a/README.md +++ b/README.md @@ -127,18 +127,21 @@ See [Quantifiers API doc](https://callstack.github.io/ts-regex-builder/api/quant ### Character classes -| Character class | Regex Syntax | Description | -| --------------------- | ------------ | ------------------------------------------------- | -| `any` | `.` | Any character | -| `word` | `\w` | Word character: letter, digit, underscore | -| `digit` | `\d` | Digit character: 0 to 9 | -| `whitespace` | `\s` | Whitespace character: space, tab, line break, ... | -| `anyOf('abc')` | `[abc]` | Any of provided characters | -| `charRange('a', 'z')` | `[a-z]` | Character in a range | -| `charClass(...)` | `[...]` | Union of multiple character classes | -| `negated(...)` | `[^...]` | Negation of a given character class | - -See [Character Classes API doc](https://callstack.github.io/ts-regex-builder/api/character-classes) for more info. 
+| Character class | Regex Syntax | Description | +| ---------------------- | ------------ | ------------------------------------------------- | +| `any` | `.` | Any character | +| `word` | `\w` | Word character: letter, digit, underscore | +| `digit` | `\d` | Digit character: 0 to 9 | +| `whitespace` | `\s` | Whitespace character: space, tab, line break, ... | +| `anyOf('abc')` | `[abc]` | Any of provided characters | +| `charRange('a', 'z')` | `[a-z]` | Character in a range | +| `charClass(...)` | `[...]` | Union of multiple character classes | +| `negated(...)` | `[^...]` | Negation of a given character class | +| `char(...)` | `\uXXXX` | Character specified by given Unicode code point | +| `unicodeProperty(...)` | `\p{...}` | Characters with given Unicode property | + + +See [Character Classes API doc](https://callstack.github.io/ts-regex-builder/api/character-classes) and [Unicode API doc](https://callstack.github.io/ts-regex-builder/api/unicode) for more info. ### Assertions diff --git a/website/docs/api/unicode.md b/website/docs/api/unicode.md index 78a8804..2c6ffc3 100644 --- a/website/docs/api/unicode.md +++ b/website/docs/api/unicode.md @@ -1,5 +1,5 @@ --- -id: character-classes +id: unicode title: Unicode --- From d64eb1f5cb4ed8c9a8ed5134763d6beacc6d8fdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Jastrze=CC=A8bski?= Date: Fri, 6 Sep 2024 22:42:55 +0200 Subject: [PATCH 10/13] chore: docs tweaks --- website/docs/api/unicode.md | 5 +++-- website/sidebars.js | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/website/docs/api/unicode.md b/website/docs/api/unicode.md index 2c6ffc3..93c3991 100644 --- a/website/docs/api/unicode.md +++ b/website/docs/api/unicode.md @@ -30,5 +30,6 @@ Unicode character property escape matching a set of characters specified by a Un Regex syntax: `\p{Property}` or `\p{Property=Value}` -@see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Unicode_character_class_escape -@see 
https://www.unicode.org/reports/tr18/ +See: +- [MDN: Unicode character class escape](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Unicode_character_class_escape) +- [UTS#18: Unicode Regular Expressions](https://www.unicode.org/reports/tr18/) diff --git a/website/sidebars.js b/website/sidebars.js index ed97527..ebb4d7b 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -33,6 +33,7 @@ export default { 'api/quantifiers', 'api/character-classes', 'api/assertions', + 'api/unicode', ], }, { From 555e2a46f96befc7b3a79f7aa17b718247afc6c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Jastrze=CC=A8bski?= Date: Fri, 6 Sep 2024 22:50:02 +0200 Subject: [PATCH 11/13] chore: tweak docs --- website/docs/api/overview.md | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/website/docs/api/overview.md b/website/docs/api/overview.md index 6f5f71f..ab6031e 100644 --- a/website/docs/api/overview.md +++ b/website/docs/api/overview.md @@ -75,18 +75,20 @@ See [Quantifiers](./api/quantifiers) for more info. ### Character classes -| Character class | Regex Syntax | Description | -| --------------------- | ------------ | ------------------------------------------------- | -| `any` | `.` | Any character | -| `word` | `\w` | Word character: letter, digit, underscore | -| `digit` | `\d` | Digit character: 0 to 9 | -| `whitespace` | `\s` | Whitespace character: space, tab, line break, ... | -| `anyOf('abc')` | `[abc]` | Any of provided characters | -| `charRange('a', 'z')` | `[a-z]` | Character in a range | -| `charClass(...)` | `[...]` | Union of multiple character classes | -| `negated(...)` | `[^...]` | Negation of a given character class | - -See [Character Classes](./api/character-classes) for more info. 
+| Character class | Regex Syntax | Description | +| ---------------------- | ------------ | ------------------------------------------------- | +| `any` | `.` | Any character | +| `word` | `\w` | Word character: letter, digit, underscore | +| `digit` | `\d` | Digit character: 0 to 9 | +| `whitespace` | `\s` | Whitespace character: space, tab, line break, ... | +| `anyOf('abc')` | `[abc]` | Any of provided characters | +| `charRange('a', 'z')` | `[a-z]` | Character in a range | +| `charClass(...)` | `[...]` | Union of multiple character classes | +| `negated(...)` | `[^...]` | Negation of a given character class | +| `char(...)` | `\uXXXX` | Character specified by given Unicode code point | +| `unicodeProperty(...)` | `\p{...}` | Characters with given Unicode property | + +See [Character Classes](./api/character-classes) and [Unicode](./api/unicode) for more info. ### Assertions From 7f7781185fa5a157b3a59e32414df91b0ae3ee8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Jastrze=CC=A8bski?= Date: Fri, 6 Sep 2024 22:54:29 +0200 Subject: [PATCH 12/13] refactor: remove duplicate code --- src/constructs/char-class.ts | 49 ------------------------------------ 1 file changed, 49 deletions(-) diff --git a/src/constructs/char-class.ts b/src/constructs/char-class.ts index 0da9375..c480d9f 100644 --- a/src/constructs/char-class.ts +++ b/src/constructs/char-class.ts @@ -55,52 +55,3 @@ export const inverted = negated; function escapeCharClass(text: string): string { return text.replace(/[\]\\]/g, '\\$&'); // $& means the whole matched string } - -/** - * Unicode character escape. - * - * Regex pattern: - * - `\uXXXX`: 4-digit hex escape for code points below 0x10000. - * - `\u{X}`: Unicode code point escape for code points above 0xFFFF. - * - * Note: for code points above 0xFFFF, the regex engine must be [unicode-aware](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode). 
- * - * @param codePoint The code point of the character to escape. - * @returns A character class representing the unicode escape. - */ -export function charCode(codePoint: number): CharacterEscape { - if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10ffff) { - throw new RangeError(`Expected a valid unicode code point but received ${codePoint}`); - } - - let escape = - codePoint < 0x10000 - ? `\\u${codePoint.toString(16).padStart(4, '0')}` // 4-digit hex (works in all modes) - : `\\u{${codePoint.toString(16)}}`; // 1-6 digit hex (requires unicode-aware mode) - - return { - precedence: 'atom', - pattern: escape, - chars: [escape], - }; -} - -/** - * Unicode character class escape matching a set of characters specified by a Unicode property. - * - * Regex pattern: `\p{Property}` or `\p{Property=Value}` - * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Unicode_character_class_escape - * - * @param property Unicode property name. - * @param value Unicode property value (optional). - * @returns A character class representing the unicode property escape. - */ -export function charProperty(property: string, value?: string): CharacterEscape { - const escape = `\\p{${property}${value ? 
`=${value}` : ''}}`; - - return { - precedence: 'atom', - pattern: escape, - chars: [escape], - }; -} From 6f53395d8c454fb9eb8d0c3d4b47e69751f0f856 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Jastrze=CC=A8bski?= Date: Fri, 6 Sep 2024 23:14:46 +0200 Subject: [PATCH 13/13] docs: tweaks --- website/docs/api/unicode.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/api/unicode.md b/website/docs/api/unicode.md index 93c3991..fc1648b 100644 --- a/website/docs/api/unicode.md +++ b/website/docs/api/unicode.md @@ -10,7 +10,7 @@ JavaScript `RegExp` object offers [Unicode-aware](https://developer.mozilla.org/ ### Character escapes ```ts -function charCode(codePoint: number): CharacterEscape; +function char(codePoint: number): CharacterEscape; ``` Regex syntax: @@ -23,7 +23,7 @@ Note: for code points above 0xFFFF, the regex engine must be [unicode-aware](htt ### Unicode character property escapes ```ts -function charProperty(property: string, value?: string): CharacterEscape; +function unicodeProperty(property: string, value?: string): CharacterEscape; ``` Unicode character property escape matching a set of characters specified by a Unicode property.