Skip to content

Commit 7c59254

Browse files
authored
Merge pull request #62863 from lorentey/character-recognizer-5.8
[5.8][stdlib] Export grapheme breaking facility
2 parents d108a04 + 83f983e commit 7c59254

File tree

5 files changed

+314
-98
lines changed

5 files changed

+314
-98
lines changed

stdlib/private/StdlibUnicodeUnittest/GraphemeBreaking.swift

Lines changed: 29 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -15,56 +15,47 @@
1515
#if _runtime(_ObjC)
1616
import Foundation
1717

18-
func parseGraphemeBreakTests(
19-
_ data: String,
20-
into result: inout [(String, Int)]
21-
) {
22-
for line in data.split(separator: "\n") {
18+
public struct GraphemeBreakTest {
19+
public let string: String
20+
public let pieces: [[Unicode.Scalar]]
21+
22+
init?(line: some StringProtocol) {
2323
// Only look at actual tests
24-
guard line.hasPrefix("÷") else {
25-
continue
26-
}
24+
guard line.hasPrefix("÷") else { return nil }
2725

2826
let info = line.split(separator: "#")
2927
let components = info[0].split(separator: " ")
3028

3129
var string = ""
32-
var count = 0
33-
34-
for i in components.indices {
35-
guard i != 0 else {
36-
continue
37-
}
38-
39-
let scalar: Unicode.Scalar
40-
41-
// If we're an odd index, this is a scalar.
42-
if i & 0x1 == 1 {
43-
scalar = Unicode.Scalar(UInt32(components[i], radix: 16)!)!
44-
30+
var pieces: [[Unicode.Scalar]] = []
31+
32+
var piece: [Unicode.Scalar] = []
33+
for component in components {
34+
switch component {
35+
case "":
36+
break
37+
case "×": // no grapheme break opportunity
38+
break
39+
case "÷": // grapheme break opportunity
40+
guard !piece.isEmpty else { break }
41+
pieces.append(piece)
42+
piece = []
43+
case _: // hexadecimal scalar value
44+
guard let value = UInt32(component, radix: 16) else { return nil }
45+
guard let scalar = Unicode.Scalar(value) else { return nil }
4546
string.unicodeScalars.append(scalar)
46-
} else {
47-
// Otherwise, it is a grapheme breaking operator.
48-
49-
// If this is a break, record the +1 count. Otherwise it is × which is
50-
// not a break.
51-
if components[i] == "÷" {
52-
count += 1
53-
}
47+
piece.append(scalar)
5448
}
5549
}
56-
57-
result.append((string, count))
50+
if !piece.isEmpty { pieces.append(piece) }
51+
self.string = string
52+
self.pieces = pieces
5853
}
5954
}
6055

61-
public let graphemeBreakTests: [(String, Int)] = {
62-
var result: [(String, Int)] = []
63-
56+
public let graphemeBreakTests: [GraphemeBreakTest] = {
6457
let testFile = readInputFile("GraphemeBreakTest.txt")
65-
66-
parseGraphemeBreakTests(testFile, into: &result)
67-
68-
return result
58+
return testFile.split(separator: "\n")
59+
.compactMap { GraphemeBreakTest(line: $0) }
6960
}()
7061
#endif

stdlib/public/core/StringGraphemeBreaking.swift

Lines changed: 129 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
//
33
// This source file is part of the Swift.org open source project
44
//
5-
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
5+
// Copyright (c) 2014 - 2023 Apple Inc. and the Swift project authors
66
// Licensed under Apache License v2.0 with Runtime Library Exception
77
//
88
// See https://swift.org/LICENSE.txt for license information
@@ -436,6 +436,117 @@ internal struct _GraphemeBreakingState {
436436
var shouldBreakRI = false
437437
}
438438

439+
extension Unicode {
440+
/// A state machine for recognizing character (i.e., extended grapheme
441+
/// cluster) boundaries in an arbitrary series of Unicode scalars.
442+
///
443+
/// To detect grapheme breaks in a sequence of Unicode scalars, feed each of
444+
/// them to the `hasBreak(before:)` method. The method returns true if the
445+
/// sequence has a grapheme break preceding the given value.
446+
///
447+
/// The results produced by this state machine are guaranteed to match the way
448+
/// `String` splits its contents into `Character` values.
449+
@available(SwiftStdlib 5.8, *)
450+
public // SPI(Foundation) FIXME: We need API for this
451+
struct _CharacterRecognizer {
452+
internal var _previous: Unicode.Scalar
453+
internal var _state: _GraphemeBreakingState
454+
455+
/// Returns a non-nil value if it can be determined whether there is a
456+
/// grapheme break between `scalar1` and `scalar2` without knowing anything
457+
/// about the scalars that precede `scalar1`. This can optionally be used as
458+
/// a fast (but incomplete) test before spinning up a full state machine
459+
/// session.
460+
@_effects(releasenone)
461+
public static func quickBreak(
462+
between scalar1: Unicode.Scalar,
463+
and scalar2: Unicode.Scalar
464+
) -> Bool? {
465+
if scalar1.value == 0xD, scalar2.value == 0xA {
466+
return false
467+
}
468+
if _hasGraphemeBreakBetween(scalar1, scalar2) {
469+
return true
470+
}
471+
return nil
472+
}
473+
474+
/// Initialize a new character recognizer at the _start of text_ (sot)
475+
/// position.
476+
///
477+
/// The resulting state machine will report a grapheme break on the
478+
/// first scalar that is fed to it.
479+
public init() {
480+
_state = _GraphemeBreakingState()
481+
// To avoid having to handle the empty case specially, we use NUL as the
482+
// placeholder before the first scalar. NUL is a control character, so per
483+
// rule GB4, it will induce an unconditional grapheme break before the
484+
// first actual scalar, emulating GB1.
485+
_previous = Unicode.Scalar(0 as UInt8)
486+
}
487+
488+
/// Feeds the next scalar to the state machine, returning a Boolean value
489+
/// indicating whether it starts a new extended grapheme cluster.
490+
///
491+
/// This method will always report a break the first time it is called
492+
/// on a newly initialized recognizer.
493+
///
494+
/// The state machine does not carry information across character
495+
/// boundaries. I.e., if this method returns true, then `self` after the
496+
/// call is in the same state as a newly initialized recognizer instance
496+
/// that has been fed the same scalar.
498+
@_effects(releasenone)
499+
public mutating func hasBreak(
500+
before next: Unicode.Scalar
501+
) -> Bool {
502+
let r = _state.shouldBreak(between: _previous, and: next)
503+
if r {
504+
_state = _GraphemeBreakingState()
505+
}
506+
_previous = next
507+
return r
508+
}
509+
510+
/// Decode the scalars in the given UTF-8 buffer and feed them to the
511+
/// recognizer up to and including the scalar following the first grapheme
512+
/// break. If the buffer contains a grapheme break, then this function
513+
/// returns the index range of the scalar that follows the first break;
514+
/// otherwise it returns `nil`.
515+
///
516+
/// On return, the state of the recognizer is updated to reflect the scalars
517+
/// up to and including the returned one. You can detect additional grapheme
518+
/// breaks by feeding the recognizer subsequent data.
519+
///
520+
/// - Parameter buffer: A buffer containing valid UTF-8 data, starting and
521+
/// ending on Unicode scalar boundaries.
522+
///
523+
/// - Parameter start: A valid index into `buffer`, addressing the first
524+
/// code unit of a UTF-8 scalar in the buffer, or the end.
525+
///
526+
/// - Returns: The index range of the scalar that follows the first grapheme
527+
/// break in the buffer, if there is one. If the buffer contains no
528+
/// grapheme breaks, then this function returns `nil`.
529+
///
530+
/// - Warning: This function does not validate that the buffer contains
531+
/// valid UTF-8 data; its behavior is undefined if given invalid input.
532+
@_effects(releasenone)
533+
public mutating func _firstBreak(
534+
inUncheckedUnsafeUTF8Buffer buffer: UnsafeBufferPointer<UInt8>,
535+
startingAt start: Int = 0
536+
) -> Range<Int>? {
537+
var i = start
538+
while i < buffer.endIndex {
539+
let (next, n) = _decodeScalar(buffer, startingAt: i)
540+
if hasBreak(before: next) {
541+
return Range(_uncheckedBounds: (i, i &+ n))
542+
}
543+
i &+= n
544+
}
545+
return nil
546+
}
547+
}
548+
}
549+
439550
extension _StringGuts {
440551
// Returns the stride of the grapheme cluster starting at offset `index`,
441552
// assuming it is on a grapheme cluster boundary.
@@ -459,7 +570,7 @@ extension _StringGuts {
459570

460571
while true {
461572
guard let (scalar2, nextIndex) = nextScalar(index) else { break }
462-
if shouldBreak(between: scalar, and: scalar2, at: index, with: &state) {
573+
if state.shouldBreak(between: scalar, and: scalar2) {
463574
break
464575
}
465576
index = nextIndex
@@ -505,7 +616,7 @@ extension _StringGuts {
505616
}
506617
}
507618

508-
extension _StringGuts {
619+
extension _GraphemeBreakingState {
509620
// Return true if there is an extended grapheme cluster boundary between two
510621
// scalars, based on state information previously collected about preceding
511622
// scalars.
@@ -517,11 +628,9 @@ extension _StringGuts {
517628
//
518629
// This is based on the Unicode Annex #29 for [Grapheme Cluster Boundary
519630
// Rules](https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules).
520-
internal func shouldBreak(
631+
internal mutating func shouldBreak(
521632
between scalar1: Unicode.Scalar,
522-
and scalar2: Unicode.Scalar,
523-
at index: Int,
524-
with state: inout _GraphemeBreakingState
633+
and scalar2: Unicode.Scalar
525634
) -> Bool {
526635
// GB3
527636
if scalar1.value == 0xD, scalar2.value == 0xA {
@@ -545,8 +654,8 @@ extension _StringGuts {
545654
var enterIndicSequence = false
546655

547656
defer {
548-
state.isInEmojiSequence = enterEmojiSequence
549-
state.isInIndicSequence = enterIndicSequence
657+
self.isInEmojiSequence = enterEmojiSequence
658+
self.isInIndicSequence = enterIndicSequence
550659
}
551660

552661
switch (x, y) {
@@ -591,14 +700,14 @@ extension _StringGuts {
591700
// continue the grapheme cluster by combining more scalars later. If we're
592701
// not currently in an emoji sequence, but our lhs scalar is a pictograph,
593702
// then that's a signal that it's the start of an emoji sequence.
594-
if state.isInEmojiSequence || x == .extendedPictographic {
703+
if self.isInEmojiSequence || x == .extendedPictographic {
595704
enterEmojiSequence = true
596705
}
597706

598707
// If we're currently in an indic sequence (or if our lhs is a linking
599708
// consonant), then this check and everything underneath ensures that
600709
// we continue being in one and may check if this extend is a Virama.
601-
if state.isInIndicSequence || scalar1._isLinkingConsonant {
710+
if self.isInIndicSequence || scalar1._isLinkingConsonant {
602711
if y == .extend {
603712
let extendNormData = Unicode._NormData(scalar2, fastUpperbound: 0x300)
604713

@@ -611,7 +720,7 @@ extension _StringGuts {
611720
enterIndicSequence = true
612721

613722
if scalar2._isVirama {
614-
state.hasSeenVirama = true
723+
self.hasSeenVirama = true
615724
}
616725
}
617726

@@ -627,32 +736,34 @@ extension _StringGuts {
627736

628737
// GB11
629738
case (.zwj, .extendedPictographic):
630-
return !state.isInEmojiSequence
739+
return !self.isInEmojiSequence
631740

632741
// GB12 & GB13
633742
case (.regionalIndicator, .regionalIndicator):
634743
defer {
635-
state.shouldBreakRI.toggle()
744+
self.shouldBreakRI.toggle()
636745
}
637746

638-
return state.shouldBreakRI
747+
return self.shouldBreakRI
639748

640749
// GB999
641750
default:
642751
// GB9c
643752
if
644-
state.isInIndicSequence,
645-
state.hasSeenVirama,
753+
self.isInIndicSequence,
754+
self.hasSeenVirama,
646755
scalar2._isLinkingConsonant
647756
{
648-
state.hasSeenVirama = false
757+
self.hasSeenVirama = false
649758
return false
650759
}
651760

652761
return true
653762
}
654763
}
764+
}
655765

766+
extension _StringGuts {
656767
// Return true if there is an extended grapheme cluster boundary between two
657768
// scalars, with no previous knowledge about preceding scalars.
658769
//

0 commit comments

Comments
 (0)