Improve gen-unicode.py to support unicode ranges

rerobika · rerobika · commit c9b1d9633724 · 2019-07-05T11:06:49.000+02:00
This patch fixes #2936.

Co-authored-by: Gabor Loki loki@inf.u-szeged.hu
JerryScript-DCO-1.0-Signed-off-by: Robert Fancsik frobert@inf.u-szeged.hu
diff --git a/jerry-core/lit/lit-unicode-ranges.inc.h b/jerry-core/lit/lit-unicode-ranges.inc.h
@@ -47,14 +47,29 @@ static const uint16_t lit_unicode_letter_interval_sps[] JERRY_ATTR_CONST_DATA =
   0x2145, 0x2160, 0x2c00, 0x2c30, 0x2c60, 0x2ceb, 0x2cf2, 0x2d00, 0x2d30, 0x2d80,
   0x2da0, 0x2da8, 0x2db0, 0x2db8, 0x2dc0, 0x2dc8, 0x2dd0, 0x2dd8, 0x3005, 0x3021,
   0x3031, 0x3038, 0x3041, 0x309d, 0x30a1, 0x30fc, 0x3105, 0x3131, 0x31a0, 0x31f0,
-  0xa000, 0xa100, 0xa200, 0xa300, 0xa400, 0xa4d0, 0xa500, 0xa600, 0xa610, 0xa62a,
-  0xa640, 0xa67f, 0xa6a0, 0xa717, 0xa722, 0xa78b, 0xa7b0, 0xa7f7, 0xa803, 0xa807,
-  0xa80c, 0xa840, 0xa882, 0xa8f2, 0xa90a, 0xa930, 0xa960, 0xa984, 0xa9e0, 0xa9e6,
-  0xa9fa, 0xaa00, 0xaa40, 0xaa44, 0xaa60, 0xaa7e, 0xaab5, 0xaab9, 0xaadb, 0xaae0,
-  0xaaf2, 0xab01, 0xab09, 0xab11, 0xab20, 0xab28, 0xab30, 0xab5c, 0xab70, 0xd7b0,
-  0xd7cb, 0xf900, 0xfa00, 0xfa70, 0xfb00, 0xfb13, 0xfb1f, 0xfb2a, 0xfb38, 0xfb40,
-  0xfb43, 0xfb46, 0xfbd3, 0xfcd3, 0xfd50, 0xfd92, 0xfdf0, 0xfe70, 0xfe76, 0xff21,
-  0xff41, 0xff66, 0xffc2, 0xffca, 0xffd2, 0xffda
+  0x3400, 0x3500, 0x3600, 0x3700, 0x3800, 0x3900, 0x3a00, 0x3b00, 0x3c00, 0x3d00,
+  0x3e00, 0x3f00, 0x4000, 0x4100, 0x4200, 0x4300, 0x4400, 0x4500, 0x4600, 0x4700,
+  0x4800, 0x4900, 0x4a00, 0x4b00, 0x4c00, 0x4d00, 0x4e00, 0x4f00, 0x5000, 0x5100,
+  0x5200, 0x5300, 0x5400, 0x5500, 0x5600, 0x5700, 0x5800, 0x5900, 0x5a00, 0x5b00,
+  0x5c00, 0x5d00, 0x5e00, 0x5f00, 0x6000, 0x6100, 0x6200, 0x6300, 0x6400, 0x6500,
+  0x6600, 0x6700, 0x6800, 0x6900, 0x6a00, 0x6b00, 0x6c00, 0x6d00, 0x6e00, 0x6f00,
+  0x7000, 0x7100, 0x7200, 0x7300, 0x7400, 0x7500, 0x7600, 0x7700, 0x7800, 0x7900,
+  0x7a00, 0x7b00, 0x7c00, 0x7d00, 0x7e00, 0x7f00, 0x8000, 0x8100, 0x8200, 0x8300,
+  0x8400, 0x8500, 0x8600, 0x8700, 0x8800, 0x8900, 0x8a00, 0x8b00, 0x8c00, 0x8d00,
+  0x8e00, 0x8f00, 0x9000, 0x9100, 0x9200, 0x9300, 0x9400, 0x9500, 0x9600, 0x9700,
+  0x9800, 0x9900, 0x9a00, 0x9b00, 0x9c00, 0x9d00, 0x9e00, 0x9f00, 0xa000, 0xa100,
+  0xa200, 0xa300, 0xa400, 0xa4d0, 0xa500, 0xa600, 0xa610, 0xa62a, 0xa640, 0xa67f,
+  0xa6a0, 0xa717, 0xa722, 0xa78b, 0xa7b0, 0xa7f7, 0xa803, 0xa807, 0xa80c, 0xa840,
+  0xa882, 0xa8f2, 0xa90a, 0xa930, 0xa960, 0xa984, 0xa9e0, 0xa9e6, 0xa9fa, 0xaa00,
+  0xaa40, 0xaa44, 0xaa60, 0xaa7e, 0xaab5, 0xaab9, 0xaadb, 0xaae0, 0xaaf2, 0xab01,
+  0xab09, 0xab11, 0xab20, 0xab28, 0xab30, 0xab5c, 0xab70, 0xac00, 0xad00, 0xae00,
+  0xaf00, 0xb000, 0xb100, 0xb200, 0xb300, 0xb400, 0xb500, 0xb600, 0xb700, 0xb800,
+  0xb900, 0xba00, 0xbb00, 0xbc00, 0xbd00, 0xbe00, 0xbf00, 0xc000, 0xc100, 0xc200,
+  0xc300, 0xc400, 0xc500, 0xc600, 0xc700, 0xc800, 0xc900, 0xca00, 0xcb00, 0xcc00,
+  0xcd00, 0xce00, 0xcf00, 0xd000, 0xd100, 0xd200, 0xd300, 0xd400, 0xd500, 0xd600,
+  0xd700, 0xd7b0, 0xd7cb, 0xf900, 0xfa00, 0xfa70, 0xfb00, 0xfb13, 0xfb1f, 0xfb2a,
+  0xfb38, 0xfb40, 0xfb43, 0xfb46, 0xfbd3, 0xfcd3, 0xfd50, 0xfd92, 0xfdf0, 0xfe70,
+  0xfe76, 0xff21, 0xff41, 0xff66, 0xffc2, 0xffca, 0xffd2, 0xffda
 };
 
 /**
@@ -88,14 +103,29 @@ static const uint8_t lit_unicode_letter_interval_lengths[] JERRY_ATTR_CONST_DATA
   0x0004, 0x0028, 0x002e, 0x002e, 0x0084, 0x0003, 0x0001, 0x0025, 0x0037, 0x0016,
   0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0002, 0x0008,
   0x0004, 0x0004, 0x0055, 0x0002, 0x0059, 0x0003, 0x0028, 0x005d, 0x001a, 0x000f,
-  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x008c, 0x002d, 0x00ff, 0x000c, 0x000f, 0x0001,
-  0x002e, 0x001e, 0x004f, 0x0008, 0x0066, 0x0023, 0x0007, 0x000a, 0x0002, 0x0003,
-  0x0016, 0x0033, 0x0031, 0x0005, 0x001b, 0x0016, 0x001c, 0x002e, 0x0004, 0x0009,
-  0x0004, 0x0028, 0x0002, 0x0007, 0x0016, 0x0031, 0x0001, 0x0004, 0x0002, 0x000a,
-  0x0002, 0x0005, 0x0005, 0x0005, 0x0006, 0x0006, 0x002a, 0x0009, 0x0072, 0x0016,
-  0x0030, 0x00ff, 0x006d, 0x0069, 0x0006, 0x0004, 0x0009, 0x000c, 0x0004, 0x0001,
-  0x0001, 0x006b, 0x00ff, 0x006a, 0x003f, 0x0035, 0x000b, 0x0004, 0x0086, 0x0019,
-  0x0019, 0x0058, 0x0005, 0x0005, 0x0005, 0x0002
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00b5, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00d5, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x008c, 0x002d, 0x00ff, 0x000c, 0x000f, 0x0001, 0x002e, 0x001e,
+  0x004f, 0x0008, 0x0066, 0x0023, 0x0007, 0x000a, 0x0002, 0x0003, 0x0016, 0x0033,
+  0x0031, 0x0005, 0x001b, 0x0016, 0x001c, 0x002e, 0x0004, 0x0009, 0x0004, 0x0028,
+  0x0002, 0x0007, 0x0016, 0x0031, 0x0001, 0x0004, 0x0002, 0x000a, 0x0002, 0x0005,
+  0x0005, 0x0005, 0x0006, 0x0006, 0x002a, 0x0009, 0x0072, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
+  0x00a3, 0x0016, 0x0030, 0x00ff, 0x006d, 0x0069, 0x0006, 0x0004, 0x0009, 0x000c,
+  0x0004, 0x0001, 0x0001, 0x006b, 0x00ff, 0x006a, 0x003f, 0x0035, 0x000b, 0x0004,
+  0x0086, 0x0019, 0x0019, 0x0058, 0x0005, 0x0005, 0x0005, 0x0002
 };
 
 /**
@@ -114,8 +144,8 @@ static const uint16_t lit_unicode_letter_chars[] JERRY_ATTR_CONST_DATA =
   0x0e8d, 0x0ea5, 0x0ea7, 0x0ebd, 0x0ec6, 0x0f00, 0x103f, 0x1061, 0x108e, 0x10c7,
   0x10cd, 0x1258, 0x12c0, 0x17d7, 0x17dc, 0x18aa, 0x1aa7, 0x1f59, 0x1f5b, 0x1f5d,
   0x1fbe, 0x2071, 0x207f, 0x2102, 0x2107, 0x2115, 0x2124, 0x2126, 0x2128, 0x214e,
-  0x2d27, 0x2d2d, 0x2d6f, 0x2e2f, 0x3400, 0x4db5, 0x4e00, 0x9fd5, 0xa8fb, 0xa8fd,
-  0xa9cf, 0xaa7a, 0xaab1, 0xaac0, 0xaac2, 0xac00, 0xd7a3, 0xfb1d, 0xfb3e
+  0x2d27, 0x2d2d, 0x2d6f, 0x2e2f, 0x3400, 0x4e00, 0xa8fb, 0xa8fd, 0xa9cf, 0xaa7a,
+  0xaab1, 0xaac0, 0xaac2, 0xac00, 0xfb1d, 0xfb3e
 };
 
 /**
diff --git a/tests/jerry/regression-test-issue-2936.js b/tests/jerry/regression-test-issue-2936.js
@@ -0,0 +1,16 @@
+// Copyright JS Foundation and other contributors, http://js.foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+var 测试 = "您好"; // regression case for issue #2936: the identifier is made of CJK ideographs
+assert(测试 === "您好");
diff --git a/tools/gen-unicode.py b/tools/gen-unicode.py
@@ -57,67 +57,86 @@ def generate(self):
             generated_source.write("\n".join(self.__header))
             generated_source.write("\n".join(self.__data))
 
-
-# functions for unicode ranges
-
-
-def read_categories(unicode_data_file):
-    """
-    Read the corresponding unicode values and store them in category lists.
-
-    :return: List of letters, non_letter and separators.
-    """
-
-    # unicode categories:      Lu Ll Lt Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs Co Lm Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
-    # letter:                  Lu Ll Lt Lm Lo Nl
-    # non-letter-indent-part:
-    #   digit:                 Nd
-    #   punctuation mark:      Mn Mc
-    #   connector punctuation: Pc
-    # separators:              Zs
-    letter_category = ["Lu", "Ll", "Lt", "Lm", "Lo", "Nl"]
-    non_letter_category = ["Nd", "Mn", "Mc", "Pc"]
-    separator_category = ["Zs"]
-
-    letters = []
-    non_letters = []
-    separators = []
-
-    with open(unicode_data_file) as unicode_data:
-        for line in csv.reader(unicode_data, delimiter=';'):
-            unicode_id = int(line[0], 16)
-
-            # Skip supplementary planes and ascii chars
-            if unicode_id >= 0x10000 or unicode_id < 128:
-                continue
-
-            category = line[2]
-
-            if category in letter_category:
-                letters.append(unicode_id)
-            elif category in non_letter_category:
-                non_letters.append(unicode_id)
-            elif category in separator_category:
-                separators.append(unicode_id)
-
-    # This separator char is handled separatly
-    non_breaking_space = 0x00A0
-    if non_breaking_space in separators:
-        separators.remove(int(non_breaking_space))
-
-    # These separator chars are not in the unicode data file or not in Zs category
-    mongolian_vowel_separator = 0x180E
-    medium_mathematical_space = 0x205F
-    zero_width_space = 0x200B
-
-    if mongolian_vowel_separator not in separators:
-        bisect.insort(separators, int(mongolian_vowel_separator))
-    if medium_mathematical_space not in separators:
-        bisect.insort(separators, int(medium_mathematical_space))
-    if zero_width_space not in separators:
-        bisect.insort(separators, int(zero_width_space))
-
-    return letters, non_letters, separators
+class UnicodeCategorizer(object):
+    """Classify the unicode ids of a unicode data file as letters, non-letters or separators."""
+    def __init__(self):
+        # unicode categories:      Lu Ll Lt Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs Co Lm Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
+        # letter:                  Lu Ll Lt Lm Lo Nl
+        # non-letter identifier part:
+        #   digit:                 Nd
+        #   combining mark:        Mn Mc
+        #   connector punctuation: Pc
+        # separators:              Zs
+        self._unicode_categories = {
+            'letters_category' : ["Lu", "Ll", "Lt", "Lm", "Lo", "Nl"],
+            'non_letters_category' : ["Nd", "Mn", "Mc", "Pc"],
+            'separators_category' : ["Zs"]
+        }
+
+        self._categories = {
+            'letters' : [],
+            'non_letters' : [],
+            'separators' : []
+        }
+
+    def _store_by_category(self, unicode_id, category):
+        """
+        Append unicode_id to each self._categories list whose '<name>_category' list has category.
+        """
+        for target_category in self._categories.keys():
+            if category in self._unicode_categories[target_category + '_category']:
+                self._categories[target_category].append(unicode_id)
+
+    def read_categories(self, unicode_data_file):
+        """
+        Read the ';' separated unicode data file and store its ids in the category lists.
+
+        :return: The letters, non_letters and separators lists (ids in [0x80, 0xFFFF] only).
+        """
+        # First id of the open range; 0 means no range is open (ids below 128 are skipped)
+        range_start_id = 0
+
+        with open(unicode_data_file) as unicode_data:
+            for line in csv.reader(unicode_data, delimiter=';'):
+                unicode_id = int(line[0], 16)
+                # Skip supplementary planes and ascii chars
+                if unicode_id >= 0x10000 or unicode_id < 128:
+                    continue
+
+                category = line[2]
+                # If a range is open this entry closes it; its category is used for the whole range
+                # NOTE(review): the first id of the range was stored on opening, so it is stored twice
+                if range_start_id != 0:
+                    while range_start_id <= unicode_id:
+                        self._store_by_category(range_start_id, category)
+                        range_start_id += 1
+                    range_start_id = 0
+                    continue
+                # NOTE(review): any name starting with '<' opens a range; presumably only the
+                # first entries of ranges are named like that - TODO confirm against the data file
+                if line[1].startswith('<'):
+                    # Save the start position of the range
+                    range_start_id = unicode_id
+
+                self._store_by_category(unicode_id, category)
+
+        # This separator char is handled separately
+        separators = self._categories['separators']
+        non_breaking_space = 0x00A0
+        if non_breaking_space in separators:
+            separators.remove(int(non_breaking_space))
+
+        # These separator chars are not in the unicode data file or not in Zs category
+        mongolian_vowel_separator = 0x180E
+        medium_mathematical_space = 0x205F
+        zero_width_space = 0x200B
+        if mongolian_vowel_separator not in separators:
+            bisect.insort(separators, int(mongolian_vowel_separator))
+        if medium_mathematical_space not in separators:
+            bisect.insort(separators, int(medium_mathematical_space))
+        if zero_width_space not in separators:
+            bisect.insort(separators, int(zero_width_space))
+        return self._categories['letters'], self._categories['non_letters'], self._categories['separators']
 
 
 def group_ranges(i):
@@ -159,7 +178,8 @@ def split_list(category_list):
 
 
 def generate_ranges(script_args):
-    letters, non_letters, separators = read_categories(script_args.unicode_data)
+    categorizer = UnicodeCategorizer()
+    letters, non_letters, separators = categorizer.read_categories(script_args.unicode_data)
 
     letter_tables = split_list(list(group_ranges(letters)))
     non_letter_tables = split_list(list(group_ranges(non_letters)))