Skip to content

Commit c9b1d96

Browse files
committed
Improve gen-unicode.py to support unicode ranges
This patch fixes #2936 Co-authored-by: Gabor Loki [email protected] JerryScript-DCO-1.0-Signed-off-by: Robert Fancsik [email protected]
1 parent 8766bb3 commit c9b1d96

File tree

3 files changed

+146
-80
lines changed

3 files changed

+146
-80
lines changed

jerry-core/lit/lit-unicode-ranges.inc.h

Lines changed: 48 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -47,14 +47,29 @@ static const uint16_t lit_unicode_letter_interval_sps[] JERRY_ATTR_CONST_DATA =
4747
0x2145, 0x2160, 0x2c00, 0x2c30, 0x2c60, 0x2ceb, 0x2cf2, 0x2d00, 0x2d30, 0x2d80,
4848
0x2da0, 0x2da8, 0x2db0, 0x2db8, 0x2dc0, 0x2dc8, 0x2dd0, 0x2dd8, 0x3005, 0x3021,
4949
0x3031, 0x3038, 0x3041, 0x309d, 0x30a1, 0x30fc, 0x3105, 0x3131, 0x31a0, 0x31f0,
50-
0xa000, 0xa100, 0xa200, 0xa300, 0xa400, 0xa4d0, 0xa500, 0xa600, 0xa610, 0xa62a,
51-
0xa640, 0xa67f, 0xa6a0, 0xa717, 0xa722, 0xa78b, 0xa7b0, 0xa7f7, 0xa803, 0xa807,
52-
0xa80c, 0xa840, 0xa882, 0xa8f2, 0xa90a, 0xa930, 0xa960, 0xa984, 0xa9e0, 0xa9e6,
53-
0xa9fa, 0xaa00, 0xaa40, 0xaa44, 0xaa60, 0xaa7e, 0xaab5, 0xaab9, 0xaadb, 0xaae0,
54-
0xaaf2, 0xab01, 0xab09, 0xab11, 0xab20, 0xab28, 0xab30, 0xab5c, 0xab70, 0xd7b0,
55-
0xd7cb, 0xf900, 0xfa00, 0xfa70, 0xfb00, 0xfb13, 0xfb1f, 0xfb2a, 0xfb38, 0xfb40,
56-
0xfb43, 0xfb46, 0xfbd3, 0xfcd3, 0xfd50, 0xfd92, 0xfdf0, 0xfe70, 0xfe76, 0xff21,
57-
0xff41, 0xff66, 0xffc2, 0xffca, 0xffd2, 0xffda
50+
0x3400, 0x3500, 0x3600, 0x3700, 0x3800, 0x3900, 0x3a00, 0x3b00, 0x3c00, 0x3d00,
51+
0x3e00, 0x3f00, 0x4000, 0x4100, 0x4200, 0x4300, 0x4400, 0x4500, 0x4600, 0x4700,
52+
0x4800, 0x4900, 0x4a00, 0x4b00, 0x4c00, 0x4d00, 0x4e00, 0x4f00, 0x5000, 0x5100,
53+
0x5200, 0x5300, 0x5400, 0x5500, 0x5600, 0x5700, 0x5800, 0x5900, 0x5a00, 0x5b00,
54+
0x5c00, 0x5d00, 0x5e00, 0x5f00, 0x6000, 0x6100, 0x6200, 0x6300, 0x6400, 0x6500,
55+
0x6600, 0x6700, 0x6800, 0x6900, 0x6a00, 0x6b00, 0x6c00, 0x6d00, 0x6e00, 0x6f00,
56+
0x7000, 0x7100, 0x7200, 0x7300, 0x7400, 0x7500, 0x7600, 0x7700, 0x7800, 0x7900,
57+
0x7a00, 0x7b00, 0x7c00, 0x7d00, 0x7e00, 0x7f00, 0x8000, 0x8100, 0x8200, 0x8300,
58+
0x8400, 0x8500, 0x8600, 0x8700, 0x8800, 0x8900, 0x8a00, 0x8b00, 0x8c00, 0x8d00,
59+
0x8e00, 0x8f00, 0x9000, 0x9100, 0x9200, 0x9300, 0x9400, 0x9500, 0x9600, 0x9700,
60+
0x9800, 0x9900, 0x9a00, 0x9b00, 0x9c00, 0x9d00, 0x9e00, 0x9f00, 0xa000, 0xa100,
61+
0xa200, 0xa300, 0xa400, 0xa4d0, 0xa500, 0xa600, 0xa610, 0xa62a, 0xa640, 0xa67f,
62+
0xa6a0, 0xa717, 0xa722, 0xa78b, 0xa7b0, 0xa7f7, 0xa803, 0xa807, 0xa80c, 0xa840,
63+
0xa882, 0xa8f2, 0xa90a, 0xa930, 0xa960, 0xa984, 0xa9e0, 0xa9e6, 0xa9fa, 0xaa00,
64+
0xaa40, 0xaa44, 0xaa60, 0xaa7e, 0xaab5, 0xaab9, 0xaadb, 0xaae0, 0xaaf2, 0xab01,
65+
0xab09, 0xab11, 0xab20, 0xab28, 0xab30, 0xab5c, 0xab70, 0xac00, 0xad00, 0xae00,
66+
0xaf00, 0xb000, 0xb100, 0xb200, 0xb300, 0xb400, 0xb500, 0xb600, 0xb700, 0xb800,
67+
0xb900, 0xba00, 0xbb00, 0xbc00, 0xbd00, 0xbe00, 0xbf00, 0xc000, 0xc100, 0xc200,
68+
0xc300, 0xc400, 0xc500, 0xc600, 0xc700, 0xc800, 0xc900, 0xca00, 0xcb00, 0xcc00,
69+
0xcd00, 0xce00, 0xcf00, 0xd000, 0xd100, 0xd200, 0xd300, 0xd400, 0xd500, 0xd600,
70+
0xd700, 0xd7b0, 0xd7cb, 0xf900, 0xfa00, 0xfa70, 0xfb00, 0xfb13, 0xfb1f, 0xfb2a,
71+
0xfb38, 0xfb40, 0xfb43, 0xfb46, 0xfbd3, 0xfcd3, 0xfd50, 0xfd92, 0xfdf0, 0xfe70,
72+
0xfe76, 0xff21, 0xff41, 0xff66, 0xffc2, 0xffca, 0xffd2, 0xffda
5873
};
5974

6075
/**
@@ -88,14 +103,29 @@ static const uint8_t lit_unicode_letter_interval_lengths[] JERRY_ATTR_CONST_DATA
88103
0x0004, 0x0028, 0x002e, 0x002e, 0x0084, 0x0003, 0x0001, 0x0025, 0x0037, 0x0016,
89104
0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0002, 0x0008,
90105
0x0004, 0x0004, 0x0055, 0x0002, 0x0059, 0x0003, 0x0028, 0x005d, 0x001a, 0x000f,
91-
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x008c, 0x002d, 0x00ff, 0x000c, 0x000f, 0x0001,
92-
0x002e, 0x001e, 0x004f, 0x0008, 0x0066, 0x0023, 0x0007, 0x000a, 0x0002, 0x0003,
93-
0x0016, 0x0033, 0x0031, 0x0005, 0x001b, 0x0016, 0x001c, 0x002e, 0x0004, 0x0009,
94-
0x0004, 0x0028, 0x0002, 0x0007, 0x0016, 0x0031, 0x0001, 0x0004, 0x0002, 0x000a,
95-
0x0002, 0x0005, 0x0005, 0x0005, 0x0006, 0x0006, 0x002a, 0x0009, 0x0072, 0x0016,
96-
0x0030, 0x00ff, 0x006d, 0x0069, 0x0006, 0x0004, 0x0009, 0x000c, 0x0004, 0x0001,
97-
0x0001, 0x006b, 0x00ff, 0x006a, 0x003f, 0x0035, 0x000b, 0x0004, 0x0086, 0x0019,
98-
0x0019, 0x0058, 0x0005, 0x0005, 0x0005, 0x0002
106+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
107+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
108+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00b5, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
109+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
110+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
111+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
112+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
113+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
114+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
115+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
116+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00d5, 0x00ff, 0x00ff,
117+
0x00ff, 0x00ff, 0x008c, 0x002d, 0x00ff, 0x000c, 0x000f, 0x0001, 0x002e, 0x001e,
118+
0x004f, 0x0008, 0x0066, 0x0023, 0x0007, 0x000a, 0x0002, 0x0003, 0x0016, 0x0033,
119+
0x0031, 0x0005, 0x001b, 0x0016, 0x001c, 0x002e, 0x0004, 0x0009, 0x0004, 0x0028,
120+
0x0002, 0x0007, 0x0016, 0x0031, 0x0001, 0x0004, 0x0002, 0x000a, 0x0002, 0x0005,
121+
0x0005, 0x0005, 0x0006, 0x0006, 0x002a, 0x0009, 0x0072, 0x00ff, 0x00ff, 0x00ff,
122+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
123+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
124+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
125+
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
126+
0x00a3, 0x0016, 0x0030, 0x00ff, 0x006d, 0x0069, 0x0006, 0x0004, 0x0009, 0x000c,
127+
0x0004, 0x0001, 0x0001, 0x006b, 0x00ff, 0x006a, 0x003f, 0x0035, 0x000b, 0x0004,
128+
0x0086, 0x0019, 0x0019, 0x0058, 0x0005, 0x0005, 0x0005, 0x0002
99129
};
100130

101131
/**
@@ -114,8 +144,8 @@ static const uint16_t lit_unicode_letter_chars[] JERRY_ATTR_CONST_DATA =
114144
0x0e8d, 0x0ea5, 0x0ea7, 0x0ebd, 0x0ec6, 0x0f00, 0x103f, 0x1061, 0x108e, 0x10c7,
115145
0x10cd, 0x1258, 0x12c0, 0x17d7, 0x17dc, 0x18aa, 0x1aa7, 0x1f59, 0x1f5b, 0x1f5d,
116146
0x1fbe, 0x2071, 0x207f, 0x2102, 0x2107, 0x2115, 0x2124, 0x2126, 0x2128, 0x214e,
117-
0x2d27, 0x2d2d, 0x2d6f, 0x2e2f, 0x3400, 0x4db5, 0x4e00, 0x9fd5, 0xa8fb, 0xa8fd,
118-
0xa9cf, 0xaa7a, 0xaab1, 0xaac0, 0xaac2, 0xac00, 0xd7a3, 0xfb1d, 0xfb3e
147+
0x2d27, 0x2d2d, 0x2d6f, 0x2e2f, 0x3400, 0x4e00, 0xa8fb, 0xa8fd, 0xa9cf, 0xaa7a,
148+
0xaab1, 0xaac0, 0xaac2, 0xac00, 0xfb1d, 0xfb3e
119149
};
120150

121151
/**
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
// Copyright JS Foundation and other contributors, http://js.foundation
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
var 测试 = "您好";
16+
assert(测试 === "您好");

tools/gen-unicode.py

Lines changed: 82 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -57,67 +57,86 @@ def generate(self):
5757
generated_source.write("\n".join(self.__header))
5858
generated_source.write("\n".join(self.__data))
5959

60-
61-
# functions for unicode ranges
62-
63-
64-
def read_categories(unicode_data_file):
65-
"""
66-
Read the corresponding unicode values and store them in category lists.
67-
68-
:return: List of letters, non_letter and separators.
69-
"""
70-
71-
# unicode categories: Lu Ll Lt Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs Co Lm Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
72-
# letter: Lu Ll Lt Lm Lo Nl
73-
# non-letter-indent-part:
74-
# digit: Nd
75-
# punctuation mark: Mn Mc
76-
# connector punctuation: Pc
77-
# separators: Zs
78-
letter_category = ["Lu", "Ll", "Lt", "Lm", "Lo", "Nl"]
79-
non_letter_category = ["Nd", "Mn", "Mc", "Pc"]
80-
separator_category = ["Zs"]
81-
82-
letters = []
83-
non_letters = []
84-
separators = []
85-
86-
with open(unicode_data_file) as unicode_data:
87-
for line in csv.reader(unicode_data, delimiter=';'):
88-
unicode_id = int(line[0], 16)
89-
90-
# Skip supplementary planes and ascii chars
91-
if unicode_id >= 0x10000 or unicode_id < 128:
92-
continue
93-
94-
category = line[2]
95-
96-
if category in letter_category:
97-
letters.append(unicode_id)
98-
elif category in non_letter_category:
99-
non_letters.append(unicode_id)
100-
elif category in separator_category:
101-
separators.append(unicode_id)
102-
103-
# This separator char is handled separatly
104-
non_breaking_space = 0x00A0
105-
if non_breaking_space in separators:
106-
separators.remove(int(non_breaking_space))
107-
108-
# These separator chars are not in the unicode data file or not in Zs category
109-
mongolian_vowel_separator = 0x180E
110-
medium_mathematical_space = 0x205F
111-
zero_width_space = 0x200B
112-
113-
if mongolian_vowel_separator not in separators:
114-
bisect.insort(separators, int(mongolian_vowel_separator))
115-
if medium_mathematical_space not in separators:
116-
bisect.insort(separators, int(medium_mathematical_space))
117-
if zero_width_space not in separators:
118-
bisect.insort(separators, int(zero_width_space))
119-
120-
return letters, non_letters, separators
60+
class UnicodeCategorizer(object):
    """
    Sort the code points listed in a UnicodeData.txt style file into the
    'letters', 'non_letters' and 'separators' lists used by the generator.
    """

    def __init__(self):
        # unicode categories: Lu Ll Lt Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs Co Lm Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
        # letter:             Lu Ll Lt Lm Lo Nl
        # non-letter-ident-part:
        #   digit:                 Nd
        #   punctuation mark:      Mn Mc
        #   connector punctuation: Pc
        # separators:         Zs
        self._unicode_categories = {
            'letters_category': ["Lu", "Ll", "Lt", "Lm", "Lo", "Nl"],
            'non_letters_category': ["Nd", "Mn", "Mc", "Pc"],
            'separators_category': ["Zs"]
        }

        # Collected code points, keyed by the target list name.
        self._categories = {
            'letters': [],
            'non_letters': [],
            'separators': []
        }

    def _store_by_category(self, unicode_id, category):
        """
        Store the given unicode_id by its category.

        :param unicode_id: Code point as an integer.
        :param category: Two letter general category (e.g. "Lu").
        """
        for target_category, members in self._categories.items():
            if category in self._unicode_categories[target_category + '_category']:
                members.append(unicode_id)

    def read_categories(self, unicode_data_file):
        """
        Read the corresponding unicode values and store them in category lists.

        Ranges that the data file compresses into a "<..., First>" /
        "<..., Last>" pair of lines are expanded, so every code point of the
        range is stored exactly once.

        :param unicode_data_file: Path of the ';' separated unicode data file.
        :return: List of letters, non_letter and separators.
        """

        # Start of a pending "<..., First>" range, None when no range is open.
        range_start_id = None

        with open(unicode_data_file) as unicode_data:
            for line in csv.reader(unicode_data, delimiter=';'):
                # Tolerate blank lines (e.g. a trailing one) in the data file.
                if not line:
                    continue

                unicode_id = int(line[0], 16)

                # Skip supplementary planes and ascii chars
                if unicode_id >= 0x10000 or unicode_id < 128:
                    continue

                name = line[1]
                category = line[2]

                if name.endswith(', First>'):
                    # Only remember the start position here: the whole range,
                    # including this code point, is stored when the matching
                    # "Last" line is reached. Storing it now as well would
                    # duplicate the first code point of every range.
                    range_start_id = unicode_id
                    continue

                if range_start_id is not None and name.endswith(', Last>'):
                    for range_id in range(range_start_id, unicode_id + 1):
                        self._store_by_category(range_id, category)
                    range_start_id = None
                    continue

                self._store_by_category(unicode_id, category)

        # This separator char is handled separately
        separators = self._categories['separators']
        non_breaking_space = 0x00A0
        if non_breaking_space in separators:
            separators.remove(non_breaking_space)

        # These separator chars are not in the unicode data file or not in Zs category
        mongolian_vowel_separator = 0x180E
        medium_mathematical_space = 0x205F
        zero_width_space = 0x200B

        for extra_separator in (mongolian_vowel_separator,
                                medium_mathematical_space,
                                zero_width_space):
            if extra_separator not in separators:
                # The list is in data file order, so insort keeps it sorted.
                bisect.insort(separators, extra_separator)

        return self._categories['letters'], self._categories['non_letters'], self._categories['separators']
121140

122141

123142
def group_ranges(i):
@@ -159,7 +178,8 @@ def split_list(category_list):
159178

160179

161180
def generate_ranges(script_args):
162-
letters, non_letters, separators = read_categories(script_args.unicode_data)
181+
categorizer = UnicodeCategorizer();
182+
letters, non_letters, separators = categorizer.read_categories(script_args.unicode_data)
163183

164184
letter_tables = split_list(list(group_ranges(letters)))
165185
non_letter_tables = split_list(list(group_ranges(non_letters)))

0 commit comments

Comments
 (0)