diff --git a/jerry-core/lit/lit-unicode-conversions.inc.h b/jerry-core/lit/lit-unicode-conversions.inc.h index 1efc1104f6..07955775e6 100644 --- a/jerry-core/lit/lit-unicode-conversions.inc.h +++ b/jerry-core/lit/lit-unicode-conversions.inc.h @@ -13,7 +13,7 @@ * limitations under the License. */ -/* This file is automatically generated by the unicode_case_conversion.py script +/* This file is automatically generated by the gen-unicode.py script * from UnicodeData-9.0.0.txt and SpecialCasing-9.0.0.txt files. Do not edit! */ /* Contains start points of character case ranges (these are bidirectional conversions). */ diff --git a/jerry-core/lit/lit-unicode-ranges.inc.h b/jerry-core/lit/lit-unicode-ranges.inc.h index 5b466d050e..b9f61b449a 100644 --- a/jerry-core/lit/lit-unicode-ranges.inc.h +++ b/jerry-core/lit/lit-unicode-ranges.inc.h @@ -13,7 +13,7 @@ * limitations under the License. */ -/* This file is automatically generated by the unicode_ranges.py script +/* This file is automatically generated by the gen-unicode.py script * from UnicodeData-9.0.0.txt. Do not edit! 
*/ /** diff --git a/tools/unicode_case_conversion.py b/tools/gen-unicode.py similarity index 60% rename from tools/unicode_case_conversion.py rename to tools/gen-unicode.py index 2dd19cc45b..e626717eea 100755 --- a/tools/unicode_case_conversion.py +++ b/tools/gen-unicode.py @@ -17,190 +17,255 @@ from __future__ import print_function import argparse +import bisect import csv import itertools import os -import sys import warnings +from gen_c_source import LICENSE, format_code from settings import PROJECT_DIR -from c_source_helper import UniCodeSource + +RANGES_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-ranges.inc.h') CONVERSIONS_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-conversions.inc.h') -def main(): - parser = argparse.ArgumentParser() +# common code generation - parser.add_argument('--unicode-data', - metavar='FILE', - action='store', - required=True, - help='specify the unicode data file') - parser.add_argument('--special-casing', - metavar='FILE', - action='store', - required=True, - help='specify the special casing file') +class UniCodeSource(object): + def __init__(self, filepath): + self.__filepath = filepath + self.__header = [LICENSE, ""] + self.__data = [] - parser.add_argument('--c-source', - metavar='FILE', - action='store', - default=CONVERSIONS_C_SOURCE, - help='specify the output c source for the conversion tables (default: %(default)s)') + def complete_header(self, completion): + self.__header.append(completion) + self.__header.append("") # for an extra empty line - script_args = parser.parse_args() + def add_table(self, table, table_name, table_type, table_descr): + self.__data.append(table_descr) + self.__data.append("static const %s jerry_%s[] JERRY_CONST_DATA =" % (table_type, table_name)) + self.__data.append("{") + self.__data.append(format_code(table, 1)) + self.__data.append("};") + self.__data.append("") # for an extra empty line - if not os.path.isfile(script_args.unicode_data) or not 
os.access(script_args.unicode_data, os.R_OK): - print('The %s file is missing or not readable!' % script_args.unicode_data) - sys.exit(1) + def generate(self): + with open(self.__filepath, 'w') as generated_source: + generated_source.write("\n".join(self.__header)) + generated_source.write("\n".join(self.__data)) - if not os.path.isfile(script_args.special_casing) or not os.access(script_args.special_casing, os.R_OK): - print('The %s file is missing or not readable!' % script_args.special_casing) - sys.exit(1) - conv_tables = ConversionTables(script_args.unicode_data, script_args.special_casing) +# functions for unicode ranges - character_case_ranges = conv_tables.get_character_case_ranges() - character_pair_ranges = conv_tables.get_character_pair_ranges() - character_pairs = conv_tables.get_character_pairs() - upper_case_special_ranges = conv_tables.get_upper_case_special_ranges() - lower_case_ranges = conv_tables.get_lower_case_ranges() - lower_case_conversions = conv_tables.get_lower_case_conversions() - upper_case_conversions = conv_tables.get_upper_case_conversions() - c_source = UniCodeSource(script_args.c_source) +def read_categories(unicode_data_file): + """ + Read the corresponding unicode values and store them in category lists. - unicode_file = os.path.basename(script_args.unicode_data) - spec_casing_file = os.path.basename(script_args.special_casing) + :return: List of letters, non_letter and separators. + """ - header_completion = ["/* This file is automatically generated by the %s script" % os.path.basename(__file__), - " * from %s and %s files. Do not edit! 
*/" % (unicode_file, spec_casing_file), - ""] + # unicode categories: Lu Ll Lt Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs Co Lm Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So + # letter: Lu Ll Lt Lm Lo Nl + # non-letter-ident-part: + # digit: Nd + # combining mark: Mn Mc + # connector punctuation: Pc + # separators: Zs + letter_category = ["Lu", "Ll", "Lt", "Lm", "Lo", "Nl"] + non_letter_category = ["Nd", "Mn", "Mc", "Pc"] + separator_category = ["Zs"] + + letters = [] + non_letters = [] + separators = [] - c_source.complete_header("\n".join(header_completion)) + with open(unicode_data_file) as unicode_data: + for line in csv.reader(unicode_data, delimiter=';'): + unicode_id = int(line[0], 16) - c_source.add_table(character_case_ranges[0], - "character_case_ranges", - "uint16_t", - ("/* Contains start points of character case ranges " - "(these are bidirectional conversions). */")) + # Skip supplementary planes and ascii chars + if unicode_id >= 0x10000 or unicode_id < 128: + continue - c_source.add_table(character_case_ranges[1], - "character_case_range_lengths", - "uint8_t", - "/* Interval lengths of start points in `character_case_ranges` table. */") + category = line[2] - c_source.add_table(character_pair_ranges[0], - "character_pair_ranges", - "uint16_t", - "/* Contains the start points of bidirectional conversion ranges. */") + if category in letter_category: + letters.append(unicode_id) + elif category in non_letter_category: + non_letters.append(unicode_id) + elif category in separator_category: + separators.append(unicode_id) - c_source.add_table(character_pair_ranges[1], - "character_pair_range_lengths", - "uint8_t", - "/* Interval lengths of start points in `character_pair_ranges` table. 
*/") + # This separator char is handled separately + non_breaking_space = 0x00A0 + if non_breaking_space in separators: + separators.remove(int(non_breaking_space)) - c_source.add_table(character_pairs, - "character_pairs", - "uint16_t", - "/* Contains lower/upper case bidirectional conversion pairs. */") + # These separator chars are not in the unicode data file or not in Zs category + mongolian_vowel_separator = 0x180E + medium_mathematical_space = 0x205F + zero_width_space = 0x200B - c_source.add_table(upper_case_special_ranges[0], - "upper_case_special_ranges", - "uint16_t", - ("/* Contains start points of one-to-two uppercase ranges where the second character\n" - " * is always the same.\n" - " */")) + if mongolian_vowel_separator not in separators: + bisect.insort(separators, int(mongolian_vowel_separator)) + if medium_mathematical_space not in separators: + bisect.insort(separators, int(medium_mathematical_space)) + if zero_width_space not in separators: + bisect.insort(separators, int(zero_width_space)) - c_source.add_table(upper_case_special_ranges[1], - "upper_case_special_range_lengths", - "uint8_t", - "/* Interval lengths for start points in `upper_case_special_ranges` table. */") + return letters, non_letters, separators - c_source.add_table(lower_case_ranges[0], - "lower_case_ranges", - "uint16_t", - "/* Contains start points of lowercase ranges. */") - c_source.add_table(lower_case_ranges[1], - "lower_case_range_lengths", - "uint8_t", - "/* Interval lengths for start points in `lower_case_ranges` table. */") +def group_ranges(i): + """ + Convert an increasing list of integers into a range list - c_source.add_table(lower_case_conversions[0], - "lower_case_conversions", - "uint16_t", - ("/* The remaining lowercase conversions. The lowercase variant can " - "be one-to-three character long. */")) + :return: List of ranges. 
+ """ + for _, group in itertools.groupby(enumerate(i), lambda q: (q[1] - q[0])): + group = list(group) + yield group[0][1], group[-1][1] - c_source.add_table(lower_case_conversions[1], - "lower_case_conversion_counters", - "uint8_t", - "/* Number of one-to-one, one-to-two, and one-to-three lowercase conversions. */") - c_source.add_table(upper_case_conversions[0], - "upper_case_conversions", - "uint16_t", - ("/* The remaining uppercase conversions. The uppercase variant can " - "be one-to-three character long. */")) +def split_list(category_list): + """ + Split list of ranges into intervals and single char lists. - c_source.add_table(upper_case_conversions[1], - "upper_case_conversion_counters", - "uint8_t", - "/* Number of one-to-one, one-to-two, and one-to-three uppercase conversions. */") + :return: List of interval starting points, interval lengths and single chars + """ - c_source.generate() + interval_sps = [] + interval_lengths = [] + chars = [] + + for element in category_list: + interval_length = element[1] - element[0] + if interval_length == 0: + chars.append(element[0]) + elif interval_length > 255: + for i in range(element[0], element[1], 256): + length = 255 if (element[1] - i > 255) else (element[1] - i) + interval_sps.append(i) + interval_lengths.append(length) + else: + interval_sps.append(element[0]) + interval_lengths.append(element[1] - element[0]) + return interval_sps, interval_lengths, chars -class ConversionTables(object): - def __init__(self, unicode_data_file, special_casing_file): - """ - Read the corresponding unicode values of lower and upper case letters and store these in tables - :param unicode_data_file: Contains the default case mappings (one-to-one mappings). - :param special_casing_file: Contains additional informative case mappings that are either not one-to-one - or which are context-sensitive. 
- """ +def generate_ranges(script_args): + letters, non_letters, separators = read_categories(script_args.unicode_data) - case_mappings = read_case_mappings(unicode_data_file, special_casing_file) - lower_case = case_mappings[0] - upper_case = case_mappings[1] + letter_tables = split_list(list(group_ranges(letters))) + non_letter_tables = split_list(list(group_ranges(non_letters))) + separator_tables = split_list(list(group_ranges(separators))) - self.__character_case_ranges = extract_ranges(lower_case, upper_case) - self.__character_pair_ranges = extract_character_pair_ranges(lower_case, upper_case) - self.__character_pairs = extract_character_pairs(lower_case, upper_case) - self.__upper_case_special_ranges = extract_special_ranges(upper_case) - self.__lower_case_ranges = extract_ranges(lower_case) - self.__lower_case_conversions = extract_conversions(lower_case) - self.__upper_case_conversions = extract_conversions(upper_case) + c_source = UniCodeSource(RANGES_C_SOURCE) - if lower_case: - warnings.warn('Not all elements extracted from the lowercase table!') - if upper_case: - warnings.warn('Not all elements extracted from the uppercase table!') + header_completion = ["/* This file is automatically generated by the %s script" % os.path.basename(__file__), + " * from %s. Do not edit! 
*/" % os.path.basename(script_args.unicode_data), + ""] - def get_character_case_ranges(self): - return self.__character_case_ranges + c_source.complete_header("\n".join(header_completion)) - def get_character_pair_ranges(self): - return self.__character_pair_ranges + c_source.add_table(letter_tables[0], + "unicode_letter_interval_sps", + "uint16_t", + ("/**\n" + " * Character interval starting points for the unicode letters.\n" + " *\n" + " * The characters covered by these intervals are from\n" + " * the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl\n" + " */")) + + c_source.add_table(letter_tables[1], + "unicode_letter_interval_lengths", + "uint8_t", + ("/**\n" + " * Character lengths for the unicode letters.\n" + " *\n" + " * The characters covered by these intervals are from\n" + " * the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl\n" + " */")) + + c_source.add_table(letter_tables[2], + "unicode_letter_chars", + "uint16_t", + ("/**\n" + " * Those unicode letter characters that are not inside any of\n" + " * the intervals specified in jerry_unicode_letter_interval_sps array.\n" + " *\n" + " * The characters are from the following Unicode categories:\n" + " * Lu, Ll, Lt, Lm, Lo, Nl\n" + " */")) - def get_character_pairs(self): - return self.__character_pairs + c_source.add_table(non_letter_tables[0], + "unicode_non_letter_ident_part_interval_sps", + "uint16_t", + ("/**\n" + " * Character interval starting points for non-letter character\n" + " * that can be used as a non-first character of an identifier.\n" + " *\n" + " * The characters covered by these intervals are from\n" + " * the following Unicode categories: Nd, Mn, Mc, Pc\n" + " */")) - def get_upper_case_special_ranges(self): - return self.__upper_case_special_ranges + c_source.add_table(non_letter_tables[1], + "unicode_non_letter_ident_part_interval_lengths", + "uint8_t", + ("/**\n" + " * Character interval lengths for non-letter character\n" + " * that can be used as a non-first 
character of an identifier.\n" + " *\n" + " * The characters covered by these intervals are from\n" + " * the following Unicode categories: Nd, Mn, Mc, Pc\n" + " */")) - def get_lower_case_ranges(self): - return self.__lower_case_ranges + c_source.add_table(non_letter_tables[2], + "unicode_non_letter_ident_part_chars", + "uint16_t", + ("/**\n" + " * Those non-letter characters that can be used as a non-first\n" + " * character of an identifier and not included in any of the intervals\n" + " * specified in jerry_unicode_non_letter_ident_part_interval_sps array.\n" + " *\n" + " * The characters are from the following Unicode categories:\n" + " * Nd, Mn, Mc, Pc\n" + " */")) - def get_lower_case_conversions(self): - return self.__lower_case_conversions + c_source.add_table(separator_tables[0], + "unicode_separator_char_interval_sps", + "uint16_t", + ("/**\n" + " * Unicode separator character interval starting points from Unicode category: Zs\n" + " */")) - def get_upper_case_conversions(self): - return self.__upper_case_conversions + c_source.add_table(separator_tables[1], + "unicode_separator_char_interval_lengths", + "uint8_t", + ("/**\n" + " * Unicode separator character interval lengths from Unicode category: Zs\n" + " */")) + + c_source.add_table(separator_tables[2], + "unicode_separator_chars", + "uint16_t", + ("/**\n" + " * Unicode separator characters that are not in the\n" + " * jerry_unicode_separator_char_intervals array.\n" + " *\n" + " * Unicode category: Zs\n" + " */")) + + c_source.generate() + + +# functions for unicode conversions def parse_unicode_sequence(raw_data): @@ -395,7 +460,7 @@ def extract_character_pair_ranges(letter_case, reverse_letter_case): else: in_range = False - # Remove all founded case mapping from the conversion tables after the scanning method + # Remove all found case mapping from the conversion tables after the scanning method for idx, letter_id in enumerate(start_points): conv_length = lengths[idx] @@ -581,5 +646,138 @@ def 
calculate_conversion_distance(letter_case, letter_id): return ord(letter_case[letter_id]) - letter_id +def generate_conversions(script_args): + # Read the corresponding unicode values of lower and upper case letters and store these in tables + case_mappings = read_case_mappings(script_args.unicode_data, script_args.special_casing) + lower_case = case_mappings[0] + upper_case = case_mappings[1] + + character_case_ranges = extract_ranges(lower_case, upper_case) + character_pair_ranges = extract_character_pair_ranges(lower_case, upper_case) + character_pairs = extract_character_pairs(lower_case, upper_case) + upper_case_special_ranges = extract_special_ranges(upper_case) + lower_case_ranges = extract_ranges(lower_case) + lower_case_conversions = extract_conversions(lower_case) + upper_case_conversions = extract_conversions(upper_case) + + if lower_case: + warnings.warn('Not all elements extracted from the lowercase table!') + if upper_case: + warnings.warn('Not all elements extracted from the uppercase table!') + + # Generate conversions output + c_source = UniCodeSource(CONVERSIONS_C_SOURCE) + + unicode_file = os.path.basename(script_args.unicode_data) + spec_casing_file = os.path.basename(script_args.special_casing) + + header_completion = ["/* This file is automatically generated by the %s script" % os.path.basename(__file__), + " * from %s and %s files. Do not edit! */" % (unicode_file, spec_casing_file), + ""] + + c_source.complete_header("\n".join(header_completion)) + + c_source.add_table(character_case_ranges[0], + "character_case_ranges", + "uint16_t", + ("/* Contains start points of character case ranges " + "(these are bidirectional conversions). */")) + + c_source.add_table(character_case_ranges[1], + "character_case_range_lengths", + "uint8_t", + "/* Interval lengths of start points in `character_case_ranges` table. 
*/") + + c_source.add_table(character_pair_ranges[0], + "character_pair_ranges", + "uint16_t", + "/* Contains the start points of bidirectional conversion ranges. */") + + c_source.add_table(character_pair_ranges[1], + "character_pair_range_lengths", + "uint8_t", + "/* Interval lengths of start points in `character_pair_ranges` table. */") + + c_source.add_table(character_pairs, + "character_pairs", + "uint16_t", + "/* Contains lower/upper case bidirectional conversion pairs. */") + + c_source.add_table(upper_case_special_ranges[0], + "upper_case_special_ranges", + "uint16_t", + ("/* Contains start points of one-to-two uppercase ranges where the second character\n" + " * is always the same.\n" + " */")) + + c_source.add_table(upper_case_special_ranges[1], + "upper_case_special_range_lengths", + "uint8_t", + "/* Interval lengths for start points in `upper_case_special_ranges` table. */") + + c_source.add_table(lower_case_ranges[0], + "lower_case_ranges", + "uint16_t", + "/* Contains start points of lowercase ranges. */") + + c_source.add_table(lower_case_ranges[1], + "lower_case_range_lengths", + "uint8_t", + "/* Interval lengths for start points in `lower_case_ranges` table. */") + + c_source.add_table(lower_case_conversions[0], + "lower_case_conversions", + "uint16_t", + ("/* The remaining lowercase conversions. The lowercase variant can " + "be one-to-three character long. */")) + + c_source.add_table(lower_case_conversions[1], + "lower_case_conversion_counters", + "uint8_t", + "/* Number of one-to-one, one-to-two, and one-to-three lowercase conversions. */") + + c_source.add_table(upper_case_conversions[0], + "upper_case_conversions", + "uint16_t", + ("/* The remaining uppercase conversions. The uppercase variant can " + "be one-to-three character long. */")) + + c_source.add_table(upper_case_conversions[1], + "upper_case_conversion_counters", + "uint8_t", + "/* Number of one-to-one, one-to-two, and one-to-three uppercase conversions. 
*/") + + c_source.generate() + + +# entry point + + +def main(): + parser = argparse.ArgumentParser(description='lit-unicode-{conversions,ranges}.inc.h generator', + epilog=''' + The input files (UnicodeData.txt, SpecialCasing.txt) + must be retrieved from + http://www.unicode.org/Public//ucd/. + The last known good version is 9.0.0. + ''') + + parser.add_argument('--unicode-data', metavar='FILE', action='store', required=True, + help='specify the unicode data file') + parser.add_argument('--special-casing', metavar='FILE', action='store', required=True, + help='specify the special casing file') + + script_args = parser.parse_args() + + if not os.path.isfile(script_args.unicode_data) or not os.access(script_args.unicode_data, os.R_OK): + parser.error('The %s file is missing or not readable!' % script_args.unicode_data) + + if not os.path.isfile(script_args.special_casing) or not os.access(script_args.special_casing, os.R_OK): + parser.error('The %s file is missing or not readable!' % script_args.special_casing) + + generate_ranges(script_args) + generate_conversions(script_args) + + if __name__ == "__main__": main() diff --git a/tools/c_source_helper.py b/tools/gen_c_source.py similarity index 58% rename from tools/c_source_helper.py rename to tools/gen_c_source.py index 89c18c0886..7dd0c2e2b9 100644 --- a/tools/c_source_helper.py +++ b/tools/gen_c_source.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python - # Copyright JS Foundation and other contributors, http://js.foundation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -31,42 +29,16 @@ */""" -class UniCodeSource(object): - def __init__(self, filepath): - self.__filepath = filepath - self.__header = [LICENSE, ""] - self.__data = [] - - def complete_header(self, completion): - self.__header.append(completion) - self.__header.append("") # for an extra empty line - - def add_table(self, table, table_name, table_type, table_descr): - self.__data.append(table_descr) - self.__data.append("static const 
%s jerry_%s[] JERRY_CONST_DATA =" % (table_type, table_name)) - self.__data.append("{") - self.__data.append(format_code(table, 1)) - self.__data.append("};") - self.__data.append("") # for an extra empty line - - def generate(self): - with open(self.__filepath, 'w') as generated_source: - generated_source.write("\n".join(self.__header)) - generated_source.write("\n".join(self.__data)) - - -def regroup(list_to_group, num): - return [list_to_group[i:i+num] for i in range(0, len(list_to_group), num)] - - -def hex_format(char, digit_number): - if isinstance(char, str): - char = ord(char) +def format_code(code, indent, digit_number=4): + def regroup(list_to_group, num): + return [list_to_group[i:i+num] for i in range(0, len(list_to_group), num)] - return ("0x{:0%sx}" % digit_number).format(char) + def hex_format(char, digit_number): + if isinstance(char, str): + char = ord(char) + return ("0x{:0%sx}" % digit_number).format(char) -def format_code(code, indent, digit_number=4): lines = [] nums_per_line = 10 diff --git a/tools/js2c.py b/tools/js2c.py index 63a66042f2..63733bc6b0 100755 --- a/tools/js2c.py +++ b/tools/js2c.py @@ -20,7 +20,9 @@ import glob import os import re -import c_source_helper + +from gen_c_source import LICENSE, format_code + HEADER = '''#ifndef JERRY_TARGETJS_H #define JERRY_TARGETJS_H @@ -61,7 +63,7 @@ def js_to_native_code(path, name, build_type): if build_type != 'debug': code = reduce_code(code) - data = c_source_helper.format_code(code, 1, 2) + data = format_code(code, 1, 2) native_code = """const static char {0}_n[] = "{0}"; const static char {0}_s[] = @@ -96,7 +98,7 @@ def main(): gen_line = "/* This file is generated by %s. Please do not modify. 
*/" % os.path.basename(__file__) - gen_output = [c_source_helper.LICENSE, "", gen_line, "", HEADER] + gen_output = [LICENSE, "", gen_line, "", HEADER] gen_structs = [NATIVE_STRUCT] if script_args.main: diff --git a/tools/unicode_ranges.py b/tools/unicode_ranges.py deleted file mode 100755 index 67d8069619..0000000000 --- a/tools/unicode_ranges.py +++ /dev/null @@ -1,260 +0,0 @@ -#!/usr/bin/env python - -# Copyright JS Foundation and other contributors, http://js.foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# -# http://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt -# - -# unicode categories: Lu Ll Lt Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs Co Lm Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So -# letter: Lu Ll Lt Lm Lo Nl -# non-letter-indent-part: -# digit: Nd -# punctuation mark: Mn Mc -# connector punctuation: Pc -# separators: Zs - -from __future__ import print_function - -import argparse -import bisect -import csv -import itertools -import os -import sys - -from c_source_helper import UniCodeSource -from settings import PROJECT_DIR - -RANGES_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-ranges.inc.h') - -def main(): - parser = argparse.ArgumentParser() - - parser.add_argument('unicode_data', - metavar='FILE', - action='store', - help='specify the unicode data file') - - parser.add_argument('--c-source', - metavar='FILE', - action='store', - default=RANGES_C_SOURCE, - help='specify the output c source (default: %(default)s)') - - script_args = parser.parse_args() - - if not os.path.isfile(script_args.unicode_data) or not os.access(script_args.unicode_data, os.R_OK): - print('The %s file is missing or not readable!' % script_args.unicode_data) - sys.exit(1) - - letters, non_letters, separators = read_categories(script_args.unicode_data) - - letter_tables = split_list(list(ranges(letters))) - non_letter_tables = split_list(list(ranges(non_letters))) - separator_tables = split_list(list(ranges(separators))) - - c_source = UniCodeSource(script_args.c_source) - - header_completion = ["/* This file is automatically generated by the %s script" % os.path.basename(__file__), - " * from %s. Do not edit! 
*/" % os.path.basename(script_args.unicode_data), - ""] - - c_source.complete_header("\n".join(header_completion)) - - c_source.add_table(letter_tables[0], - "unicode_letter_interval_sps", - "uint16_t", - ("/**\n" - " * Character interval starting points for the unicode letters.\n" - " *\n" - " * The characters covered by these intervals are from\n" - " * the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl\n" - " */")) - - c_source.add_table(letter_tables[1], - "unicode_letter_interval_lengths", - "uint8_t", - ("/**\n" - " * Character lengths for the unicode letters.\n" - " *\n" - " * The characters covered by these intervals are from\n" - " * the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl\n" - " */")) - - c_source.add_table(letter_tables[2], - "unicode_letter_chars", - "uint16_t", - ("/**\n" - " * Those unicode letter characters that are not inside any of\n" - " * the intervals specified in jerry_unicode_letter_interval_sps array.\n" - " *\n" - " * The characters are from the following Unicode categories:\n" - " * Lu, Ll, Lt, Lm, Lo, Nl\n" - " */")) - - c_source.add_table(non_letter_tables[0], - "unicode_non_letter_ident_part_interval_sps", - "uint16_t", - ("/**\n" - " * Character interval starting points for non-letter character\n" - " * that can be used as a non-first character of an identifier.\n" - " *\n" - " * The characters covered by these intervals are from\n" - " * the following Unicode categories: Nd, Mn, Mc, Pc\n" - " */")) - - c_source.add_table(non_letter_tables[1], - "unicode_non_letter_ident_part_interval_lengths", - "uint8_t", - ("/**\n" - " * Character interval lengths for non-letter character\n" - " * that can be used as a non-first character of an identifier.\n" - " *\n" - " * The characters covered by these intervals are from\n" - " * the following Unicode categories: Nd, Mn, Mc, Pc\n" - " */")) - - c_source.add_table(non_letter_tables[2], - "unicode_non_letter_ident_part_chars", - "uint16_t", - ("/**\n" - " * Those non-letter 
characters that can be used as a non-first\n" - " * character of an identifier and not included in any of the intervals\n" - " * specified in jerry_unicode_non_letter_ident_part_interval_sps array.\n" - " *\n" - " * The characters are from the following Unicode categories:\n" - " * Nd, Mn, Mc, Pc\n" - " */")) - - c_source.add_table(separator_tables[0], - "unicode_separator_char_interval_sps", - "uint16_t", - ("/**\n" - " * Unicode separator character interval starting points from Unicode category: Zs\n" - " */")) - - c_source.add_table(separator_tables[1], - "unicode_separator_char_interval_lengths", - "uint8_t", - ("/**\n" - " * Unicode separator character interval lengths from Unicode category: Zs\n" - " */")) - - c_source.add_table(separator_tables[2], - "unicode_separator_chars", - "uint16_t", - ("/**\n" - " * Unicode separator characters that are not in the\n" - " * jerry_unicode_separator_char_intervals array.\n" - " *\n" - " * Unicode category: Zs\n" - " */")) - - c_source.generate() - - -def read_categories(unicode_data_file): - """ - Read the corresponding unicode values and store them in category lists. - - :return: List of letters, non_letter and separators. 
- """ - - letter_category = ["Lu", "Ll", "Lt", "Lm", "Lo", "Nl"] - non_letter_category = ["Nd", "Mn", "Mc", "Pc"] - separator_category = ["Zs"] - - letters = [] - non_letters = [] - separators = [] - - with open(unicode_data_file) as unicode_data: - for line in csv.reader(unicode_data, delimiter=';'): - unicode_id = int(line[0], 16) - - # Skip supplementary planes and ascii chars - if unicode_id >= 0x10000 or unicode_id < 128: - continue - - category = line[2] - - if category in letter_category: - letters.append(unicode_id) - elif category in non_letter_category: - non_letters.append(unicode_id) - elif category in separator_category: - separators.append(unicode_id) - - # This separator char is handled separatly - non_breaking_space = 0x00A0 - if non_breaking_space in separators: - separators.remove(int(non_breaking_space)) - - # These separator chars are not in the unicode data file or not in Zs category - mongolian_vowel_separator = 0x180E - medium_mathematical_space = 0x205F - zero_width_space = 0x200B - - if mongolian_vowel_separator not in separators: - bisect.insort(separators, int(mongolian_vowel_separator)) - if medium_mathematical_space not in separators: - bisect.insort(separators, int(medium_mathematical_space)) - if zero_width_space not in separators: - bisect.insort(separators, int(zero_width_space)) - - return letters, non_letters, separators - - -def ranges(i): - """ - Convert an increasing list of integers into a range list - - :return: List of ranges. - """ - for _, group in itertools.groupby(enumerate(i), lambda q: (q[1] - q[0])): - group = list(group) - yield group[0][1], group[-1][1] - - -def split_list(category_list): - """ - Split list of ranges into intervals and single char lists. 
- - :return: List of interval starting points, interval lengths and single chars - """ - - interval_sps = [] - interval_lengths = [] - chars = [] - - for element in category_list: - interval_length = element[1] - element[0] - if interval_length == 0: - chars.append(element[0]) - elif interval_length > 255: - for i in range(element[0], element[1], 256): - length = 255 if (element[1] - i > 255) else (element[1] - i) - interval_sps.append(i) - interval_lengths.append(length) - else: - interval_sps.append(element[0]) - interval_lengths.append(element[1] - element[0]) - - return interval_sps, interval_lengths, chars - - -if __name__ == "__main__": - main()