From 70a56e60f3fe0107786c8e5ddae913440764966f Mon Sep 17 00:00:00 2001
From: Alejandro Alonso
Date: Fri, 16 Feb 2024 17:11:00 -0800
Subject: [PATCH 1/2] Add Unicode Word Break generator

---
 utils/gen-unicode-data/Input/WordData.h       |  20 +++
 utils/gen-unicode-data/Package.swift          |   8 +-
 .../Sources/GenWorkBreak/main.swift           | 180 ++++++++++++++++++
 3 files changed, 206 insertions(+), 2 deletions(-)
 create mode 100644 utils/gen-unicode-data/Input/WordData.h
 create mode 100644 utils/gen-unicode-data/Sources/GenWorkBreak/main.swift

diff --git a/utils/gen-unicode-data/Input/WordData.h b/utils/gen-unicode-data/Input/WordData.h
new file mode 100644
index 0000000000000..4384517be6bc9
--- /dev/null
+++ b/utils/gen-unicode-data/Input/WordData.h
@@ -0,0 +1,20 @@
+//===----------------------------------------------------------------------===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) 2022 Apple Inc. and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
+//
+//===----------------------------------------------------------------------===//
+
+// This was auto-generated by utils/gen-unicode-data/GenWordBreak,
+// please do not edit this file yourself!
+
+#ifndef WORD_DATA_H
+#define WORD_DATA_H
+
+#include "swift/shims/SwiftStdint.h"
+
diff --git a/utils/gen-unicode-data/Package.swift b/utils/gen-unicode-data/Package.swift
index 8fd28ebd2b934..fdab0841d40a9 100644
--- a/utils/gen-unicode-data/Package.swift
+++ b/utils/gen-unicode-data/Package.swift
@@ -1,10 +1,10 @@
-// swift-tools-version:5.4
+// swift-tools-version:5.6
 
 import PackageDescription
 
 let package = Package(
   name: "GenUnicodeData",
-  platforms: [.macOS(.v10_15)],
+  platforms: [.macOS(.v12)],
   targets: [
     .target(
       name: "GenUtils",
@@ -21,6 +21,10 @@ let package = Package(
     .executableTarget(
       name: "GenScalarProps",
       dependencies: ["GenUtils"]
+    ),
+    .executableTarget(
+      name: "GenWordBreak",
+      dependencies: ["GenUtils"]
     )
   ]
 )
diff --git a/utils/gen-unicode-data/Sources/GenWorkBreak/main.swift b/utils/gen-unicode-data/Sources/GenWorkBreak/main.swift
new file mode 100644
index 0000000000000..fddeb81a4e96f
--- /dev/null
+++ b/utils/gen-unicode-data/Sources/GenWorkBreak/main.swift
@@ -0,0 +1,180 @@
+//===----------------------------------------------------------------------===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) 2022 Apple Inc. and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
+//
+//===----------------------------------------------------------------------===//
+
+import GenUtils
+
+extension Unicode {
+  /// The word break properties that get emitted into the generated table.
+  /// The raw value is the byte written to `_swift_stdlib_words_data`.
+  enum WordBreakProperty: UInt8 {
+    // We don't store the other properties, so we really don't care about them
+    // here.
+
+    case extend = 0
+    case format = 1
+    case katakana = 2
+    case hebrewLetter = 3
+    case aLetter = 4
+    case midNumLet = 5
+    case midLetter = 6
+    case midNum = 7
+    case numeric = 8
+    case extendNumLet = 9
+    case wSegSpace = 10
+    case extendedPictographic = 11
+
+    /// Creates a property from its name as spelled in the Unicode data files,
+    /// or returns `nil` for any property that is not stored.
+    init?(_ str: String) {
+      switch str {
+      case "Extend":
+        self = .extend
+      case "Format":
+        self = .format
+      case "Katakana":
+        self = .katakana
+      case "Hebrew_Letter":
+        self = .hebrewLetter
+      case "ALetter":
+        self = .aLetter
+      case "MidNumLet":
+        self = .midNumLet
+      case "MidLetter":
+        self = .midLetter
+      case "MidNum":
+        self = .midNum
+      case "Numeric":
+        self = .numeric
+      case "ExtendNumLet":
+        self = .extendNumLet
+      case "WSegSpace":
+        self = .wSegSpace
+      case "Extended_Pictographic":
+        self = .extendedPictographic
+      default:
+        return nil
+      }
+    }
+  }
+}
+
+/// Parses the Unicode data file at `path` and returns the scalar ranges whose
+/// property is one of the stored `Unicode.WordBreakProperty` cases.
+///
+/// Comment lines and lines naming any other property are skipped. The parsed
+/// ranges are passed through `flatten` before being returned.
+func getWordBreakPropertyData(
+  for path: String
+) -> [(ClosedRange<UInt32>, Unicode.WordBreakProperty)] {
+  let data = readFile(path)
+
+  var unflattened: [(ClosedRange<UInt32>, Unicode.WordBreakProperty)] = []
+
+  for line in data.split(separator: "\n") {
+    // Skip comments
+    guard !line.hasPrefix("#") else {
+      continue
+    }
+
+    // Each line in this file is broken up into two sections:
+    // 1: Either the singular scalar or a range of scalars that have said
+    //    word break property.
+    // 2: The word break property that said scalar(s) have (with
+    //    additional comments noting the character category, name and amount of
+    //    scalars the range represents).
+    let components = line.split(separator: ";")
+
+    // Get the property first because it may be one we don't care about.
+    let splitProperty = components[1].split(separator: "#")
+    let filteredProperty = splitProperty[0].filter { !$0.isWhitespace }
+
+    guard let wbp = Unicode.WordBreakProperty(filteredProperty) else {
+      continue
+    }
+
+    let scalars: ClosedRange<UInt32>
+
+    let filteredScalars = components[0].filter { !$0.isWhitespace }
+
+    // If a "." appears, we have a range of the form XXXX..YYYY. Otherwise,
+    // it's a singular scalar.
+    if filteredScalars.contains(".") {
+      let range = filteredScalars.split(separator: ".")
+
+      scalars = UInt32(range[0], radix: 16)! ... UInt32(range[1], radix: 16)!
+    } else {
+      let scalar = UInt32(filteredScalars, radix: 16)!
+
+      scalars = scalar ... scalar
+    }
+
+    unflattened.append((scalars, wbp))
+  }
+
+  return flatten(unflattened)
+}
+
+/// Emits `data` into `result` twice via `emitCollection`: once as
+/// `_swift_stdlib_words` (range lower bound and scalar count packed into a
+/// `__swift_uint32_t`) and once as `_swift_stdlib_words_data` (raw values).
+func emit(
+  _ data: [(ClosedRange<UInt32>, Unicode.WordBreakProperty)],
+  into result: inout String
+) {
+  emitCollection(
+    data,
+    name: "_swift_stdlib_words",
+    type: "__swift_uint32_t",
+    into: &result
+  ) {
+    // The scalar count is shifted above bit 21, which leaves the low 21 bits
+    // for the lower bound and the remaining 11 bits for the count.
+    // NOTE(review): a count above 2047 would not fit in 11 bits; confirm
+    // that the ranges reaching this point stay below that.
+    var value = $0.0.lowerBound
+    value |= UInt32($0.0.count) << 21
+
+    return "0x\(String(value, radix: 16, uppercase: true))"
+  }
+
+  emitCollection(
+    data,
+    name: "_swift_stdlib_words_data",
+    type: "__swift_uint8_t",
+    into: &result
+  ) {
+    let value = $0.1.rawValue
+
+    return "0x\(String(value, radix: 16, uppercase: true))"
+  }
+}
+
+// Main entry point into the word break property generator.
+func generateWordBreakProperty() {
+  var result = readFile("Input/WordData.h")
+
+  let baseData = getWordBreakPropertyData(for: "Data/15/WordBreakProperty.txt")
+  let emojiData = getWordBreakPropertyData(for: "Data/15/emoji-data.txt")
+
+  let data = flatten(baseData + emojiData)
+
+  emit(data, into: &result)
+
+  result += """
+  #endif // #ifndef WORD_DATA_H
+
+  """
+
+  write(result, to: "Output/Common/WordData.h")
+}
+
+generateWordBreakProperty()
From ad5aac0d371366383ec6ae1579b3bbb6895304dc Mon Sep 17 00:00:00 2001
From: Alejandro Alonso
Date: Fri, 16 Feb 2024 17:41:52 -0800
Subject: [PATCH 2/2] Fix folder name

---
 .../Sources/{GenWorkBreak => GenWordBreak}/main.swift | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 rename utils/gen-unicode-data/Sources/{GenWorkBreak => GenWordBreak}/main.swift (98%)

diff --git a/utils/gen-unicode-data/Sources/GenWorkBreak/main.swift b/utils/gen-unicode-data/Sources/GenWordBreak/main.swift
similarity index 98%
rename from utils/gen-unicode-data/Sources/GenWorkBreak/main.swift
rename to utils/gen-unicode-data/Sources/GenWordBreak/main.swift
index fddeb81a4e96f..ad424938d6cb3 100644
--- a/utils/gen-unicode-data/Sources/GenWorkBreak/main.swift
+++ b/utils/gen-unicode-data/Sources/GenWordBreak/main.swift
@@ -2,7 +2,7 @@
 //
 // This source file is part of the Swift.org open source project
 //
-// Copyright (c) 2022 Apple Inc. and the Swift project authors
+// Copyright (c) 2024 Apple Inc. and the Swift project authors
 // Licensed under Apache License v2.0 with Runtime Library Exception
 //
 // See https://swift.org/LICENSE.txt for license information