Skip to content

Pre-compute unicode category list for xclasses #647

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 112 additions & 6 deletions src/pcre2_compile_class.c
Original file line number Diff line number Diff line change
Expand Up @@ -115,10 +115,11 @@ while (TRUE)

#ifdef SUPPORT_UNICODE

#define PARSE_CLASS_UTF 0x1
#define PARSE_CLASS_CASELESS_UTF 0x2
#define PARSE_CLASS_RESTRICTED_UTF 0x4
#define PARSE_CLASS_TURKISH_UTF 0x8
#define PARSE_CLASS_UTF 0x01
#define PARSE_CLASS_CASELESS_UTF 0x02
#define PARSE_CLASS_RESTRICTED_UTF 0x04
#define PARSE_CLASS_TURKISH_UTF 0x08
#define PARSE_CLASS_COMPUTE_CATLIST 0x10

/* Get the range of nocase characters which includes the
'c' character passed as argument, or directly follows 'c'. */
Expand Down Expand Up @@ -357,13 +358,28 @@ append_non_ascii_range(uint32_t options, uint32_t *buffer)
return buffer + 2;
}

/* When PARSE_CLASS_COMPUTE_CATLIST is set in options, the buffer argument is
used as a pointer to the category list instead of a range buffer. */
static size_t
parse_class(uint32_t *ptr, uint32_t options, uint32_t *buffer)
{
size_t total_size = 0;
size_t size;
uint32_t meta_arg;
uint32_t start_char;
uint32_t ptype;
#ifdef SUPPORT_UNICODE
uint32_t pdata;
uint32_t category_list;
uint32_t *pcategory_list = NULL;
#endif

#ifdef SUPPORT_UNICODE
if ((options & PARSE_CLASS_COMPUTE_CATLIST) != 0)
{
pcategory_list = buffer;
buffer = NULL;
}
#endif

while (TRUE)
{
Expand Down Expand Up @@ -407,7 +423,8 @@ while (TRUE)
case ESC_p:
case ESC_P:
ptr++;
if (meta_arg == ESC_p && (*ptr >> 16) == PT_ANY)
ptype = (*ptr >> 16);
if (meta_arg == ESC_p && ptype == PT_ANY)
{
if (buffer != NULL)
{
Expand All @@ -417,6 +434,43 @@ while (TRUE)
}
total_size += 2;
}
#ifdef SUPPORT_UNICODE
if (pcategory_list == NULL) break;

category_list = 0;

switch(ptype)
{
case PT_LAMP:
category_list = UCPCAT3(ucp_Lu, ucp_Ll, ucp_Lt);
break;

case PT_GC:
pdata = *ptr & 0xffff;
category_list = UCPCAT_RANGE(PRIV(ucp_typerange)[pdata],
PRIV(ucp_typerange)[pdata + 1] - 1);
break;

case PT_PC:
pdata = *ptr & 0xffff;
category_list = UCPCAT(pdata);
break;

case PT_WORD:
category_list = UCPCAT2(ucp_Mn, ucp_Pc) | UCPCAT_L | UCPCAT_N;
break;

case PT_ALNUM:
category_list = UCPCAT_L | UCPCAT_N;
break;
}

if (category_list > 0)
{
if (meta_arg == ESC_P) category_list ^= UCPCAT_ALL;
*pcategory_list |= category_list;
}
#endif
break;
}
ptr++;
Expand Down Expand Up @@ -511,6 +565,9 @@ const uint32_t *char_list_next;
uint16_t *next_char;
uint32_t char_list_start, char_list_end;
uint32_t range_start, range_end;
#ifdef SUPPORT_UNICODE
uint32_t category_list = 0;
#endif

#ifdef SUPPORT_UNICODE
if (options & PCRE2_UTF)
Expand All @@ -531,11 +588,22 @@ if (xoptions & PCRE2_EXTRA_TURKISH_CASING)

/* Compute required space for the range. */

#ifdef SUPPORT_UNICODE
range_list_size = parse_class(start_ptr,
class_options | PARSE_CLASS_COMPUTE_CATLIST,
&category_list);
#else
range_list_size = parse_class(start_ptr, class_options, NULL);
#endif
PCRE2_ASSERT((range_list_size & 0x1) == 0);

/* Allocate buffer. The total_size also represents the end of the buffer. */

#ifdef SUPPORT_UNICODE
/* Every category matches, so the class is replaced by an OP_ALLANY, which
needs only a single range (one start/end pair). */
if (category_list == UCPCAT_ALL) range_list_size = 2;
#endif

total_size = range_list_size +
((range_list_size >= 2) ? CHAR_LIST_EXTRA_SIZE : 0);

Expand All @@ -553,6 +621,21 @@ cranges->range_list_size = (uint16_t)range_list_size;
cranges->char_lists_types = 0;
cranges->char_lists_size = 0;
cranges->char_lists_start = 0;
#ifdef SUPPORT_UNICODE
cranges->category_list = category_list;
#endif

#ifdef SUPPORT_UNICODE
if (category_list == UCPCAT_ALL)
{
/* Replace the xclass with OP_ALLANY. */
cranges->category_list = 0;
buffer = (uint32_t*)(cranges + 1);
buffer[0] = 0;
buffer[1] = get_highest_char(class_options);
return cranges;
}
#endif

if (range_list_size == 0) return cranges;

Expand Down Expand Up @@ -1087,6 +1170,7 @@ BOOL utf = FALSE;

#ifdef SUPPORT_WIDE_CHARS
uint32_t xclass_props;
uint32_t category_list;
PCRE2_UCHAR *class_uchardata;
class_ranges* cranges;
#else
Expand All @@ -1107,6 +1191,7 @@ should_flip_negation = FALSE;

#ifdef SUPPORT_WIDE_CHARS
xclass_props = 0;
category_list = 0;

#if PCRE2_CODE_UNIT_WIDTH == 8
cranges = NULL;
Expand Down Expand Up @@ -1140,6 +1225,9 @@ if (utf)
cb->first_data = cranges->header.next;
}

category_list = cranges->category_list;
PCRE2_ASSERT(category_list != UCPCAT_ALL);

if (cranges->range_list_size > 0)
{
const uint32_t *ranges = (const uint32_t*)(cranges + 1);
Expand All @@ -1154,6 +1242,13 @@ if (utf)
}

class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */

if (cranges != NULL && category_list != 0 &&
(xclass_props & XCLASS_HIGH_ANY) == 0)
{
xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS;
class_uchardata += sizeof(uint32_t) / sizeof(PCRE2_UCHAR);
}
#endif /* SUPPORT_WIDE_CHARS */

/* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
Expand Down Expand Up @@ -1444,7 +1539,9 @@ while (TRUE)

PRIV(update_classbits)(ptype, pdata, (escape == ESC_P), classbits);

if ((xclass_props & XCLASS_HIGH_ANY) == 0)
if ((xclass_props & XCLASS_HIGH_ANY) == 0 &&
ptype != PT_LAMP && ptype != PT_GC && ptype != PT_PC &&
ptype != PT_WORD && ptype != PT_ALNUM)
{
if (lengthptr != NULL)
*lengthptr += 3;
Expand Down Expand Up @@ -1709,6 +1806,15 @@ if ((xclass_props & XCLASS_REQUIRED) != 0)
*code = negate_class? XCL_NOT:0;
if ((xclass_props & XCLASS_HAS_PROPS) != 0) *code |= XCL_HASPROP;

/* The category_list is placed after the class feature bitset.
The code pointer is not increased, because the bitset for the
first 256 characters may be injected after the feature bitset. */
if (category_list != 0)
{
*code |= XCL_HASCATLIST;
memmove(code + 1, &category_list, sizeof(uint32_t));
}

/* If the map is required, move up the extra data to make room for it;
otherwise just move the code pointer to the end of the extra data. */

Expand Down
19 changes: 16 additions & 3 deletions src/pcre2_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -1515,9 +1515,10 @@ table. */
/* Flag bits and data types for the extended class (OP_XCLASS) for classes that
contain characters with values greater than 255. */

#define XCL_NOT 0x01 /* Flag: this is a negative class */
#define XCL_MAP 0x02 /* Flag: a 32-byte map is present */
#define XCL_HASPROP 0x04 /* Flag: property checks are present. */
#define XCL_NOT 0x01 /* Flag: this is a negative class */
#define XCL_MAP 0x02 /* Flag: a 32-byte map is present */
#define XCL_HASPROP 0x04 /* Flag: property checks are present */
#define XCL_HASCATLIST 0x08 /* Flag: category list is present */

#define XCL_END 0 /* Marks end of individual items */
#define XCL_SINGLE 1 /* Single item (one multibyte char) follows */
Expand Down Expand Up @@ -2189,6 +2190,18 @@ typedef struct {
((uint32_t)(ch) == 0x0130u ? 0x69u : \
(uint32_t)(ch) == 0x49u ? 0x0131u : (uint32_t)(ch))

/* UCP bitset manipulating macros. Each macro builds a set of Unicode
categories in which category number 'n' is represented by bit 'n'. The shifted
constant is unsigned so that the results have the same signedness as the
uint32_t category lists they are combined with, and so that no signed shift
can overflow. */

#ifdef SUPPORT_UNICODE
/* Set containing the single category 'bit'. */
#define UCPCAT(bit) (1u << (bit))
/* Sets containing two or three categories. */
#define UCPCAT2(bit1, bit2) (UCPCAT(bit1) | UCPCAT(bit2))
#define UCPCAT3(bit1, bit2, bit3) (UCPCAT(bit1) | UCPCAT(bit2) | UCPCAT(bit3))
/* Set containing every category from 'start' to 'end', both inclusive. */
#define UCPCAT_RANGE(start, end) (((1u << ((end) + 1)) - 1) - ((1u << (start)) - 1))
/* Categories in the ucp_Ll..ucp_Lu and ucp_Nd..ucp_No ranges. */
#define UCPCAT_L UCPCAT_RANGE(ucp_Ll, ucp_Lu)
#define UCPCAT_N UCPCAT_RANGE(ucp_Nd, ucp_No)
/* Every category up to and including ucp_Zs. */
#define UCPCAT_ALL ((1u << (ucp_Zs + 1)) - 1)
#endif

/* The "scriptx" and bprops fields contain offsets into vectors of 32-bit words
that form a bitmap representing a list of scripts or boolean properties. These
macros test or set a bit in the map by number. */
Expand Down
3 changes: 3 additions & 0 deletions src/pcre2_intmodedep.h
Original file line number Diff line number Diff line change
Expand Up @@ -754,6 +754,9 @@ typedef struct class_ranges {
compile_data header; /* Common header */
size_t char_lists_size; /* Total size of encoded char lists */
size_t char_lists_start; /* Start offset of encoded char lists */
#ifdef SUPPORT_UNICODE
uint32_t category_list; /* Bitset of matching unicode categories. */
#endif
uint16_t range_list_size; /* Size of ranges array */
uint16_t char_lists_types; /* The XCL_LIST header of char lists */
/* Followed by the list of ranges (start/end pairs) */
Expand Down
30 changes: 14 additions & 16 deletions src/pcre2_jit_char_inc.h
Original file line number Diff line number Diff line change
Expand Up @@ -533,6 +533,13 @@ if (flags & XCL_MAP)
cc += 32 / sizeof(PCRE2_UCHAR);

#ifdef SUPPORT_UNICODE
if (flags & XCL_HASCATLIST)
{
memcpy(&category_list, cc, sizeof(uint32_t));
status |= XCLASS_HAS_TYPE;
cc += sizeof(uint32_t) / sizeof(PCRE2_UCHAR);
}

while (*cc == XCL_PROP || *cc == XCL_NOTPROP)
{
compares++;
Expand All @@ -542,12 +549,14 @@ while (*cc == XCL_PROP || *cc == XCL_NOTPROP)

switch(*cc)
{
/* The JIT compiler also uses this code path for bare escape sequences
(those outside a class), so the category computation here must be kept. */
case PT_LAMP:
items = UCPCAT3(ucp_Lu, ucp_Ll, ucp_Lt);
break;

case PT_GC:
items = UCPCAT_RANGE(PRIV(ucp_typerange)[(int)cc[1] * 2], PRIV(ucp_typerange)[(int)cc[1] * 2 + 1]);
items = UCPCAT_RANGE(PRIV(ucp_typerange)[(int)cc[1]], PRIV(ucp_typerange)[(int)cc[1] + 1] - 1);
break;

case PT_PC:
Expand Down Expand Up @@ -614,21 +623,7 @@ while (*cc == XCL_PROP || *cc == XCL_NOTPROP)
cc += 2;
}

if (category_list == UCPCAT_ALL)
{
/* All or no characters are accepted, same as dotall. */
if (status & XCLASS_IS_ECLASS)
{
if (list != backtracks)
OP2(SLJIT_OR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
return;
}

compile_char1_matchingpath(common, OP_ALLANY, cc, backtracks, FALSE);
if (list == backtracks)
add_jump(compiler, backtracks, JUMP(SLJIT_JUMP));
return;
}
SLJIT_ASSERT(category_list != UCPCAT_ALL);

if (category_list != 0)
compares++;
Expand Down Expand Up @@ -681,6 +676,9 @@ if ((flags & XCL_MAP) != 0)
}

#ifdef SUPPORT_UNICODE
if (flags & XCL_HASCATLIST)
cc += sizeof(uint32_t) / sizeof(PCRE2_UCHAR);

if (status & XCLASS_NEEDS_UCD)
{
if ((status & (XCLASS_SAVE_CHAR | XCLASS_IS_ECLASS)) == XCLASS_SAVE_CHAR)
Expand Down
10 changes: 0 additions & 10 deletions src/pcre2_jit_compile.c
Original file line number Diff line number Diff line change
Expand Up @@ -7147,16 +7147,6 @@ else
JUMPTO(SLJIT_JUMP, mainloop);
}

#ifdef SUPPORT_UNICODE
#define UCPCAT(bit) (1 << (bit))
#define UCPCAT2(bit1, bit2) (UCPCAT(bit1) | UCPCAT(bit2))
#define UCPCAT3(bit1, bit2, bit3) (UCPCAT(bit1) | UCPCAT(bit2) | UCPCAT(bit3))
#define UCPCAT_RANGE(start, end) (((1 << ((end) + 1)) - 1) - ((1 << (start)) - 1))
#define UCPCAT_L UCPCAT_RANGE(ucp_Ll, ucp_Lu)
#define UCPCAT_N UCPCAT_RANGE(ucp_Nd, ucp_No)
#define UCPCAT_ALL ((1 << (ucp_Zs + 1)) - 1)
#endif

static void check_wordboundary(compiler_common *common, BOOL ucp)
{
DEFINE_COMPILER;
Expand Down
Loading
Loading