Skip to content

Commit 6df6a6a

Browse files
committed
Improve gen-unicode.py to support unicode ranges
This patch fixes #2936.
Co-authored-by: Gabor Loki [email protected]
JerryScript-DCO-1.0-Signed-off-by: Robert Fancsik [email protected]
1 parent 8766bb3 commit 6df6a6a

File tree

3 files changed

+147
-80
lines changed

3 files changed

+147
-80
lines changed

jerry-core/lit/lit-unicode-ranges.inc.h

Lines changed: 48 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -47,14 +47,29 @@ static const uint16_t lit_unicode_letter_interval_sps[] JERRY_ATTR_CONST_DATA =
4747
0x2145, 0x2160, 0x2c00, 0x2c30, 0x2c60, 0x2ceb, 0x2cf2, 0x2d00, 0x2d30, 0x2d80,
4848
0x2da0, 0x2da8, 0x2db0, 0x2db8, 0x2dc0, 0x2dc8, 0x2dd0, 0x2dd8, 0x3005, 0x3021,
4949
0x3031, 0x3038, 0x3041, 0x309d, 0x30a1, 0x30fc, 0x3105, 0x3131, 0x31a0, 0x31f0,
50-
0xa000, 0xa100, 0xa200, 0xa300, 0xa400, 0xa4d0, 0xa500, 0xa600, 0xa610, 0xa62a,
51-
0xa640, 0xa67f, 0xa6a0, 0xa717, 0xa722, 0xa78b, 0xa7b0, 0xa7f7, 0xa803, 0xa807,
52-
0xa80c, 0xa840, 0xa882, 0xa8f2, 0xa90a, 0xa930, 0xa960, 0xa984, 0xa9e0, 0xa9e6,
53-
0xa9fa, 0xaa00, 0xaa40, 0xaa44, 0xaa60, 0xaa7e, 0xaab5, 0xaab9, 0xaadb, 0xaae0,
54-
0xaaf2, 0xab01, 0xab09, 0xab11, 0xab20, 0xab28, 0xab30, 0xab5c, 0xab70, 0xd7b0,
55-
0xd7cb, 0xf900, 0xfa00, 0xfa70, 0xfb00, 0xfb13, 0xfb1f, 0xfb2a, 0xfb38, 0xfb40,
56-
0xfb43, 0xfb46, 0xfbd3, 0xfcd3, 0xfd50, 0xfd92, 0xfdf0, 0xfe70, 0xfe76, 0xff21,
57-
0xff41, 0xff66, 0xffc2, 0xffca, 0xffd2, 0xffda
50+
0x3400, 0x3500, 0x3600, 0x3700, 0x3800, 0x3900, 0x3a00, 0x3b00, 0x3c00, 0x3d00,
51+
0x3e00, 0x3f00, 0x4000, 0x4100, 0x4200, 0x4300, 0x4400, 0x4500, 0x4600, 0x4700,
52+
0x4800, 0x4900, 0x4a00, 0x4b00, 0x4c00, 0x4d00, 0x4e00, 0x4f00, 0x5000, 0x5100,
53+
0x5200, 0x5300, 0x5400, 0x5500, 0x5600, 0x5700, 0x5800, 0x5900, 0x5a00, 0x5b00,
54+
0x5c00, 0x5d00, 0x5e00, 0x5f00, 0x6000, 0x6100, 0x6200, 0x6300, 0x6400, 0x6500,
55+
0x6600, 0x6700, 0x6800, 0x6900, 0x6a00, 0x6b00, 0x6c00, 0x6d00, 0x6e00, 0x6f00,
56+
0x7000, 0x7100, 0x7200, 0x7300, 0x7400, 0x7500, 0x7600, 0x7700, 0x7800, 0x7900,
57+
0x7a00, 0x7b00, 0x7c00, 0x7d00, 0x7e00, 0x7f00, 0x8000, 0x8100, 0x8200, 0x8300,
58+
0x8400, 0x8500, 0x8600, 0x8700, 0x8800, 0x8900, 0x8a00, 0x8b00, 0x8c00, 0x8d00,
59+
0x8e00, 0x8f00, 0x9000, 0x9100, 0x9200, 0x9300, 0x9400, 0x9500, 0x9600, 0x9700,
60+
0x9800, 0x9900, 0x9a00, 0x9b00, 0x9c00, 0x9d00, 0x9e00, 0x9f00, 0xa000, 0xa100,
61+
0xa200, 0xa300, 0xa400, 0xa4d0, 0xa500, 0xa600, 0xa610, 0xa62a, 0xa640, 0xa67f,
62+
0xa6a0, 0xa717, 0xa722, 0xa78b, 0xa7b0, 0xa7f7, 0xa803, 0xa807, 0xa80c, 0xa840,
63+
0xa882, 0xa8f2, 0xa90a, 0xa930, 0xa960, 0xa984, 0xa9e0, 0xa9e6, 0xa9fa, 0xaa00,
64+
0xaa40, 0xaa44, 0xaa60, 0xaa7e, 0xaab5, 0xaab9, 0xaadb, 0xaae0, 0xaaf2, 0xab01,
65+
0xab09, 0xab11, 0xab20, 0xab28, 0xab30, 0xab5c, 0xab70, 0xac00, 0xad00, 0xae00,
66+
0xaf00, 0xb000, 0xb100, 0xb200, 0xb300, 0xb400, 0xb500, 0xb600, 0xb700, 0xb800,
67+
0xb900, 0xba00, 0xbb00, 0xbc00, 0xbd00, 0xbe00, 0xbf00, 0xc000, 0xc100, 0xc200,
68+
0xc300, 0xc400, 0xc500, 0xc600, 0xc700, 0xc800, 0xc900, 0xca00, 0xcb00, 0xcc00,
69+
0xcd00, 0xce00, 0xcf00, 0xd000, 0xd100, 0xd200, 0xd300, 0xd400, 0xd500, 0xd600,
70+
0xd700, 0xd7b0, 0xd7cb, 0xf900, 0xfa00, 0xfa70, 0xfb00, 0xfb13, 0xfb1f, 0xfb2a,
71+
0xfb38, 0xfb40, 0xfb43, 0xfb46, 0xfbd3, 0xfcd3, 0xfd50, 0xfd92, 0xfdf0, 0xfe70,
72+
0xfe76, 0xff21, 0xff41, 0xff66, 0xffc2, 0xffca, 0xffd2, 0xffda
5873
};
5974

6075
/**
@@ -88,14 +103,29 @@ static const uint8_t lit_unicode_letter_interval_lengths[] JERRY_ATTR_CONST_DATA
88103
0x0004, 0x0028, 0x002e, 0x002e, 0x0084, 0x0003, 0x0001, 0x0025, 0x0037, 0x0016,
89104
0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0002, 0x0008,
90105
0x0004, 0x0004, 0x0055, 0x0002, 0x0059, 0x0003, 0x0028, 0x005d, 0x001a, 0x000f,
91-
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x008c, 0x002d, 0x00ff, 0x000c, 0x000f, 0x0001,
92-
0x002e, 0x001e, 0x004f, 0x0008, 0x0066, 0x0023, 0x0007, 0x000a, 0x0002, 0x0003,
93-
0x0016, 0x0033, 0x0031, 0x0005, 0x001b, 0x0016, 0x001c, 0x002e, 0x0004, 0x0009,
94-
0x0004, 0x0028, 0x0002, 0x0007, 0x0016, 0x0031, 0x0001, 0x0004, 0x0002, 0x000a,
95-
0x0002, 0x0005, 0x0005, 0x0005, 0x0006, 0x0006, 0x002a, 0x0009, 0x0072, 0x0016,
96-
0x0030, 0x00ff, 0x006d, 0x0069, 0x0006, 0x0004, 0x0009, 0x000c, 0x0004, 0x0001,
97-
0x0001, 0x006b, 0x00ff, 0x006a, 0x003f, 0x0035, 0x000b, 0x0004, 0x0086, 0x0019,
98-
0x0019, 0x0058, 0x0005, 0x0005, 0x0005, 0x0002
106+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
107+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
108+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00b5, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
109+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
110+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
111+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
112+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
113+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
114+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
115+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
116+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00d5, 0x00ff, 0x00ff,
117+
0x00ff, 0x00ff, 0x008c, 0x002d, 0x00ff, 0x000c, 0x000f, 0x0001, 0x002e, 0x001e,
118+
0x004f, 0x0008, 0x0066, 0x0023, 0x0007, 0x000a, 0x0002, 0x0003, 0x0016, 0x0033,
119+
0x0031, 0x0005, 0x001b, 0x0016, 0x001c, 0x002e, 0x0004, 0x0009, 0x0004, 0x0028,
120+
0x0002, 0x0007, 0x0016, 0x0031, 0x0001, 0x0004, 0x0002, 0x000a, 0x0002, 0x0005,
121+
0x0005, 0x0005, 0x0006, 0x0006, 0x002a, 0x0009, 0x0072, 0x00ff, 0x00ff, 0x00ff,
122+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
123+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
124+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
125+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
126+
0x00a3, 0x0016, 0x0030, 0x00ff, 0x006d, 0x0069, 0x0006, 0x0004, 0x0009, 0x000c,
127+
0x0004, 0x0001, 0x0001, 0x006b, 0x00ff, 0x006a, 0x003f, 0x0035, 0x000b, 0x0004,
128+
0x0086, 0x0019, 0x0019, 0x0058, 0x0005, 0x0005, 0x0005, 0x0002
99129
};
100130

101131
/**
@@ -114,8 +144,8 @@ static const uint16_t lit_unicode_letter_chars[] JERRY_ATTR_CONST_DATA =
114144
0x0e8d, 0x0ea5, 0x0ea7, 0x0ebd, 0x0ec6, 0x0f00, 0x103f, 0x1061, 0x108e, 0x10c7,
115145
0x10cd, 0x1258, 0x12c0, 0x17d7, 0x17dc, 0x18aa, 0x1aa7, 0x1f59, 0x1f5b, 0x1f5d,
116146
0x1fbe, 0x2071, 0x207f, 0x2102, 0x2107, 0x2115, 0x2124, 0x2126, 0x2128, 0x214e,
117-
0x2d27, 0x2d2d, 0x2d6f, 0x2e2f, 0x3400, 0x4db5, 0x4e00, 0x9fd5, 0xa8fb, 0xa8fd,
118-
0xa9cf, 0xaa7a, 0xaab1, 0xaac0, 0xaac2, 0xac00, 0xd7a3, 0xfb1d, 0xfb3e
147+
0x2d27, 0x2d2d, 0x2d6f, 0x2e2f, 0x3400, 0x4e00, 0xa8fb, 0xa8fd, 0xa9cf, 0xaa7a,
148+
0xaab1, 0xaac0, 0xaac2, 0xac00, 0xfb1d, 0xfb3e
119149
};
120150

121151
/**
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
// Copyright JS Foundation and other contributors, http://js.foundation
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
var 测试 = "您好";
16+
assert(测试 === "您好");

tools/gen-unicode.py

Lines changed: 83 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -57,67 +57,87 @@ def generate(self):
5757
generated_source.write("\n".join(self.__header))
5858
generated_source.write("\n".join(self.__data))
5959

60-
61-
# functions for unicode ranges
62-
63-
64-
def read_categories(unicode_data_file):
65-
"""
66-
Read the corresponding unicode values and store them in category lists.
67-
68-
:return: List of letters, non_letter and separators.
69-
"""
70-
71-
# unicode categories: Lu Ll Lt Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs Co Lm Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
72-
# letter: Lu Ll Lt Lm Lo Nl
73-
# non-letter-indent-part:
74-
# digit: Nd
75-
# punctuation mark: Mn Mc
76-
# connector punctuation: Pc
77-
# separators: Zs
78-
letter_category = ["Lu", "Ll", "Lt", "Lm", "Lo", "Nl"]
79-
non_letter_category = ["Nd", "Mn", "Mc", "Pc"]
80-
separator_category = ["Zs"]
81-
82-
letters = []
83-
non_letters = []
84-
separators = []
85-
86-
with open(unicode_data_file) as unicode_data:
87-
for line in csv.reader(unicode_data, delimiter=';'):
88-
unicode_id = int(line[0], 16)
89-
90-
# Skip supplementary planes and ascii chars
91-
if unicode_id >= 0x10000 or unicode_id < 128:
92-
continue
93-
94-
category = line[2]
95-
96-
if category in letter_category:
97-
letters.append(unicode_id)
98-
elif category in non_letter_category:
99-
non_letters.append(unicode_id)
100-
elif category in separator_category:
101-
separators.append(unicode_id)
102-
103-
# This separator char is handled separatly
104-
non_breaking_space = 0x00A0
105-
if non_breaking_space in separators:
106-
separators.remove(int(non_breaking_space))
107-
108-
# These separator chars are not in the unicode data file or not in Zs category
109-
mongolian_vowel_separator = 0x180E
110-
medium_mathematical_space = 0x205F
111-
zero_width_space = 0x200B
112-
113-
if mongolian_vowel_separator not in separators:
114-
bisect.insort(separators, int(mongolian_vowel_separator))
115-
if medium_mathematical_space not in separators:
116-
bisect.insort(separators, int(medium_mathematical_space))
117-
if zero_width_space not in separators:
118-
bisect.insort(separators, int(zero_width_space))
119-
120-
return letters, non_letters, separators
60+
class UnicodeCategorizer(object):
    """
    Sort the non-ASCII code points of the Basic Multilingual Plane into
    letter, non-letter (identifier part) and separator lists, based on the
    general category column of a UnicodeData.txt style file.
    """

    def __init__(self):
        # unicode categories: Lu Ll Lt Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs
        #                     Co Lm Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
        # letter:                 Lu Ll Lt Lm Lo Nl
        # non-letter-ident-part:
        #   digit:                Nd
        #   punctuation mark:     Mn Mc
        #   connector punctuation: Pc
        # separators:             Zs
        #
        # The keys of this dict are the keys of self._categories with a
        # '_category' suffix; _store_by_category relies on that naming.
        self._unicode_categories = {
            'letters_category': ["Lu", "Ll", "Lt", "Lm", "Lo", "Nl"],
            'non_letters_category': ["Nd", "Mn", "Mc", "Pc"],
            'separators_category': ["Zs"]
        }

        # Collected code points, appended in the order they are read.
        self._categories = {
            'letters': [],
            'non_letters': [],
            'separators': []
        }

    def _store_by_category(self, unicode_id, category):
        """
        Store the given unicode_id by its category.

        Code points whose category is not listed in any target group are
        silently dropped.
        """
        for target_category in self._categories:
            if category in self._unicode_categories[target_category + '_category']:
                self._categories[target_category].append(unicode_id)

    def read_categories(self, unicode_data_file):
        """
        Read the corresponding unicode values and store them in category lists.

        Records named '<..., First>' and '<..., Last>' describe a whole range
        of code points; every code point of such a range is stored exactly
        once with the category of the closing record.

        :param unicode_data_file: Path of the ';' separated unicode data file.
        :return: List of letters, non_letter and separators.
        """

        # Start of a range whose closing record has not been read yet.
        range_start_id = None

        with open(unicode_data_file) as unicode_data:
            for line in csv.reader(unicode_data, delimiter=';'):
                unicode_id = int(line[0], 16)

                # Skip supplementary planes and ascii chars
                if unicode_id >= 0x10000 or unicode_id < 128:
                    continue

                name = line[1]
                category = line[2]

                if range_start_id is not None:
                    # This record closes the pending range: store every code
                    # point from the start up to and including this one.
                    for range_id in range(range_start_id, unicode_id + 1):
                        self._store_by_category(range_id, category)
                    range_start_id = None
                    continue

                # Only '<..., First>' opens a range. Other bracketed names
                # such as '<control>' are ordinary single records and must
                # not swallow the record that follows them.
                if name.startswith('<') and name.endswith(', First>'):
                    # The start itself is stored when the range is closed,
                    # so it is not stored here (this avoids a duplicate).
                    range_start_id = unicode_id
                    continue

                self._store_by_category(unicode_id, category)

        # This separator char is handled separately
        separators = self._categories['separators']
        non_breaking_space = 0x00A0
        if non_breaking_space in separators:
            separators.remove(non_breaking_space)

        # These separator chars are not in the unicode data file or not in Zs category
        mongolian_vowel_separator = 0x180E
        medium_mathematical_space = 0x205F
        zero_width_space = 0x200B

        for extra_separator in (mongolian_vowel_separator,
                                medium_mathematical_space,
                                zero_width_space):
            if extra_separator not in separators:
                bisect.insort(separators, extra_separator)

        return self._categories['letters'], self._categories['non_letters'], self._categories['separators']
121141

122142

123143
def group_ranges(i):
@@ -159,7 +179,8 @@ def split_list(category_list):
159179

160180

161181
def generate_ranges(script_args):
162-
letters, non_letters, separators = read_categories(script_args.unicode_data)
182+
categorizer = UnicodeCategorizer()
183+
letters, non_letters, separators = categorizer.read_categories(script_args.unicode_data)
163184

164185
letter_tables = split_list(list(group_ranges(letters)))
165186
non_letter_tables = split_list(list(group_ranges(non_letters)))

0 commit comments

Comments
 (0)