Skip to content

Commit 30984f5

Browse files
authored
Merge pull request #71699 from Azoy/gen-word-view
[utils] Add Unicode Word Break generator
2 parents 636a326 + ad5aac0 commit 30984f5

File tree

3 files changed

+190
-2
lines changed

3 files changed

+190
-2
lines changed
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2022 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
// This was auto-generated by utils/gen-unicode-data/GenWordBreak,
14+
// please do not edit this file yourself!
15+
16+
#ifndef WORD_DATA_H
17+
#define WORD_DATA_H
18+
19+
#include "swift/shims/SwiftStdint.h"
20+

utils/gen-unicode-data/Package.swift

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1-
// swift-tools-version:5.4
1+
// swift-tools-version:5.6
22

33
import PackageDescription
44

55
let package = Package(
66
name: "GenUnicodeData",
7-
platforms: [.macOS(.v10_15)],
7+
platforms: [.macOS(.v12)],
88
targets: [
99
.target(
1010
name: "GenUtils",
@@ -21,6 +21,10 @@ let package = Package(
2121
.executableTarget(
2222
name: "GenScalarProps",
2323
dependencies: ["GenUtils"]
24+
),
25+
.executableTarget(
26+
name: "GenWordBreak",
27+
dependencies: ["GenUtils"]
2428
)
2529
]
2630
)
Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2024 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
import GenUtils
14+
15+
extension Unicode {
  /// The word break property values that this generator records.
  ///
  /// Property values without a case here are not stored, so nothing else in
  /// this file needs to know about them. Each case has an explicit raw value
  /// from 0 through 11.
  enum WordBreakProperty: UInt8 {
    case extend = 0
    case format = 1
    case katakana = 2
    case hebrewLetter = 3
    case aLetter = 4
    case midNumLet = 5
    case midLetter = 6
    case midNum = 7
    case numeric = 8
    case extendNumLet = 9
    case wSegSpace = 10
    case extendedPictographic = 11

    /// Maps the property spelling accepted by `init(_:)` to its case.
    private static let propertiesByName: [String: WordBreakProperty] = [
      "Extend": .extend,
      "Format": .format,
      "Katakana": .katakana,
      "Hebrew_Letter": .hebrewLetter,
      "ALetter": .aLetter,
      "MidNumLet": .midNumLet,
      "MidLetter": .midLetter,
      "MidNum": .midNum,
      "Numeric": .numeric,
      "ExtendNumLet": .extendNumLet,
      "WSegSpace": .wSegSpace,
      "Extended_Pictographic": .extendedPictographic,
    ]

    /// Creates the property whose name is exactly `str`.
    ///
    /// The match is case-sensitive and whitespace is not trimmed. Returns
    /// `nil` for any name that has no case.
    init?(_ str: String) {
      guard let property = Self.propertiesByName[str] else {
        return nil
      }

      self = property
    }
  }
}
65+
66+
/// Reads a property data file and collects the scalar ranges whose property is
/// one of the stored `Unicode.WordBreakProperty` cases.
///
/// Every non-comment line is expected to have the shape
/// `<scalar or first..last> ; <property> # <trailing comment>`. Lines whose
/// property has no `Unicode.WordBreakProperty` case are ignored, and so are
/// lines that carry no `;`-separated property field at all.
///
/// A scalar field that is not valid hexadecimal (or whose bounds are
/// reversed) stops the generator with a message naming the offending text.
///
/// - Parameter path: The path handed to `readFile` to load the data file.
/// - Returns: The collected `(range, property)` pairs after they have been
///   passed through `flatten`.
func getWordBreakPropertyData(
  for path: String
) -> [(ClosedRange<UInt32>, Unicode.WordBreakProperty)] {
  let data = readFile(path)

  var unflattened: [(ClosedRange<UInt32>, Unicode.WordBreakProperty)] = []

  for line in data.split(separator: "\n") {
    // Skip comments
    guard !line.hasPrefix("#") else {
      continue
    }

    // Each line in this file is broken up into two sections:
    // 1: Either the singular scalar or a range of scalars who conform to said
    //    word break property.
    // 2: The word break property that said scalar(s) conform to (with
    //    additional comments noting the character category, name and amount of
    //    scalars the range represents).
    let components = line.split(separator: ";")

    // Get the property first because it may be one we don't care about. A
    // line with no property field (for example one holding only whitespace)
    // has nothing to record, and indexing into it would trap.
    guard components.count >= 2,
          let rawProperty = components[1].split(separator: "#").first else {
      continue
    }

    let filteredProperty = rawProperty.filter { !$0.isWhitespace }

    guard let property = Unicode.WordBreakProperty(filteredProperty) else {
      continue
    }

    let filteredScalars = components[0].filter { !$0.isWhitespace }

    // "first..last" splits into two bounds, a singular scalar into one; in
    // the latter case the same text serves as both ends of the range.
    let bounds = filteredScalars.split(separator: ".")

    // A malformed scalar field means the input file is corrupt. Stop loudly
    // instead of emitting a table with missing entries.
    guard (1 ... 2).contains(bounds.count),
          let lower = UInt32(bounds[0], radix: 16),
          let upper = UInt32(bounds[bounds.count - 1], radix: 16),
          lower <= upper else {
      fatalError("Malformed scalar range \"\(filteredScalars)\" in \(path)")
    }

    unflattened.append((lower ... upper, property))
  }

  return flatten(unflattened)
}
116+
117+
/// Emits the two parallel word break tables through `emitCollection`.
///
/// `_swift_stdlib_words` gets one 32-bit value per entry: the range's lower
/// bound in the low 21 bits and the number of scalars in the range in the
/// high 11 bits. `_swift_stdlib_words_data` gets the raw value of the property
/// for the entry at the same index.
///
/// Only 11 bits are available for the scalar count, and `<<` silently drops
/// bits that overflow, so a range longer than 2047 scalars cannot be encoded
/// as a single entry. Such ranges are split into consecutive chunks that each
/// fit; both tables are built from the same split list so they stay parallel.
///
/// NOTE(review): splitting adds adjacent entries that share a property.
/// Confirm the code that reads these tables does not assume merged ranges.
///
/// - Parameters:
///   - data: The `(range, property)` pairs to encode, emitted in the order
///     given.
///   - result: The string handed to `emitCollection` as the destination for
///     both tables.
func emit(
  _ data: [(ClosedRange<UInt32>, Unicode.WordBreakProperty)],
  into result: inout String
) {
  // The lower bound occupies bits 0..<21; the count occupies bits 21..<32.
  let countShift: UInt32 = 21
  let maxCount: UInt32 = (1 << (32 - countShift)) - 1

  var encodable: [(ClosedRange<UInt32>, Unicode.WordBreakProperty)] = []
  encodable.reserveCapacity(data.count)

  for (range, property) in data {
    precondition(
      range.upperBound >> countShift == 0,
      "Scalar does not fit in the low 21 bits of a table entry"
    )

    var lower = range.lowerBound

    // `upperBound - lower >= maxCount` means more than `maxCount` scalars
    // remain, so peel off one full chunk and continue with the rest.
    while range.upperBound - lower >= maxCount {
      encodable.append((lower ... lower + (maxCount - 1), property))
      lower += maxCount
    }

    encodable.append((lower ... range.upperBound, property))
  }

  emitCollection(
    encodable,
    name: "_swift_stdlib_words",
    type: "__swift_uint32_t",
    into: &result
  ) {
    var value = $0.0.lowerBound
    value |= UInt32($0.0.count) << countShift

    return "0x\(String(value, radix: 16, uppercase: true))"
  }

  emitCollection(
    encodable,
    name: "_swift_stdlib_words_data",
    type: "__swift_uint8_t",
    into: &result
  ) {
    let value = $0.1.rawValue

    return "0x\(String(value, radix: 16, uppercase: true))"
  }
}
144+
145+
// Main entry point into the word break property generator.
//
// Starts from the contents of Input/WordData.h, appends the tables built from
// the word break and emoji data files, closes the include guard, and writes
// the result to Output/Common/WordData.h. The function name is kept as-is.
func generateGraphemeBreakProperty() {
  var header = readFile("Input/WordData.h")

  let wordBreakRanges = getWordBreakPropertyData(
    for: "Data/15/WordBreakProperty.txt"
  )
  let emojiRanges = getWordBreakPropertyData(for: "Data/15/emoji-data.txt")

  // Each list was flattened on its own; flatten again once they are combined.
  let combinedRanges = flatten(wordBreakRanges + emojiRanges)

  emit(combinedRanges, into: &header)

  header += "#endif // #ifndef WORD_DATA_H\n"

  write(header, to: "Output/Common/WordData.h")
}

generateGraphemeBreakProperty()

0 commit comments

Comments
 (0)