2
2
//
3
3
// This source file is part of the Swift.org open source project
4
4
//
5
- // Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
5
+ // Copyright (c) 2014 - 2023 Apple Inc. and the Swift project authors
6
6
// Licensed under Apache License v2.0 with Runtime Library Exception
7
7
//
8
8
// See https://swift.org/LICENSE.txt for license information
@@ -436,6 +436,117 @@ internal struct _GraphemeBreakingState {
436
436
var shouldBreakRI = false
437
437
}
438
438
439
extension Unicode {
  /// An incremental detector of character (extended grapheme cluster)
  /// boundaries over an arbitrary stream of Unicode scalars.
  ///
  /// Feed the scalars one at a time to `hasBreak(before:)`; the method returns
  /// `true` whenever a grapheme break precedes the scalar it was just given.
  ///
  /// The boundaries reported by this state machine are guaranteed to agree
  /// with the way `String` divides its contents into `Character` values.
  @available(SwiftStdlib 5.8, *)
  public // SPI(Foundation) FIXME: We need API for this
  struct _CharacterRecognizer {
    /// The scalar most recently fed to the recognizer (or the NUL placeholder
    /// installed by `init()` before anything has been fed).
    internal var _previous: Unicode.Scalar
    /// Breaking state accumulated since the last reported boundary.
    internal var _state: _GraphemeBreakingState

    /// Attempts to decide whether a grapheme break separates `scalar1` and
    /// `scalar2` using only those two scalars, i.e. without any knowledge of
    /// what precedes `scalar1`.
    ///
    /// This is an optional fast path that callers may try before starting a
    /// full state machine session; it is deliberately incomplete.
    ///
    /// - Returns: `false` for a CR followed by LF, `true` if
    ///   `_hasGraphemeBreakBetween` reports a break for the pair, and `nil`
    ///   when the answer cannot be determined from the two scalars alone.
    @_effects(releasenone)
    public static func quickBreak(
      between scalar1: Unicode.Scalar,
      and scalar2: Unicode.Scalar
    ) -> Bool? {
      // CR LF never splits; this must be tested before the pairwise check.
      guard !(scalar1.value == 0xD && scalar2.value == 0xA) else {
        return false
      }
      guard _hasGraphemeBreakBetween(scalar1, scalar2) else {
        return nil
      }
      return true
    }

    /// Creates a recognizer positioned at the _start of text_ (sot).
    ///
    /// The first scalar fed to the new state machine is always reported as
    /// being preceded by a grapheme break.
    public init() {
      // Rather than special-casing the empty state, a NUL scalar stands in for
      // "the scalar before the first one". NUL is a control character, so rule
      // GB4 (break after controls) forces an unconditional grapheme break
      // before the first real scalar, which emulates GB1.
      _previous = Unicode.Scalar(0 as UInt8)
      _state = _GraphemeBreakingState()
    }

    /// Feeds the next scalar to the state machine and reports whether that
    /// scalar begins a new extended grapheme cluster.
    ///
    /// The very first call on a freshly initialized recognizer always reports
    /// a break.
    ///
    /// No information is carried across character boundaries: whenever this
    /// method returns `true`, `self` afterwards is equivalent to a newly
    /// initialized recognizer that has been fed the same scalar.
    @_effects(releasenone)
    public mutating func hasBreak(
      before next: Unicode.Scalar
    ) -> Bool {
      let startsNewCharacter = _state.shouldBreak(between: _previous, and: next)
      _previous = next
      if startsNewCharacter {
        // Discard everything learned about the cluster that just ended.
        _state = _GraphemeBreakingState()
      }
      return startsNewCharacter
    }

    /// Decodes scalars from the given UTF-8 buffer and feeds them to the
    /// recognizer, stopping right after the scalar that follows the first
    /// grapheme break. If a grapheme break is found, the index range of the
    /// scalar following that break is returned; otherwise the result is `nil`.
    ///
    /// On return, the recognizer's state reflects every scalar up to and
    /// including the returned one. Further grapheme breaks can be detected by
    /// continuing to feed the recognizer subsequent data.
    ///
    /// - Parameter buffer: A buffer containing valid UTF-8 data, starting and
    ///    ending on Unicode scalar boundaries.
    ///
    /// - Parameter start: A valid index into `buffer`, addressing the first
    ///    code unit of a UTF-8 scalar in the buffer, or the end.
    ///
    /// - Returns: The index range of the scalar that follows the first grapheme
    ///    break in the buffer, if there is one. If the buffer contains no
    ///    grapheme breaks, then this function returns `nil`.
    ///
    /// - Warning: This function does not validate that the buffer contains
    ///    valid UTF-8 data; its behavior is undefined if given invalid input.
    @_effects(releasenone)
    public mutating func _firstBreak(
      inUncheckedUnsafeUTF8Buffer buffer: UnsafeBufferPointer<UInt8>,
      startingAt start: Int = 0
    ) -> Range<Int>? {
      var offset = start
      while offset < buffer.endIndex {
        let (scalar, length) = _decodeScalar(buffer, startingAt: offset)
        let scalarEnd = offset &+ length
        if hasBreak(before: scalar) {
          return Range(_uncheckedBounds: (offset, scalarEnd))
        }
        offset = scalarEnd
      }
      return nil
    }
  }
}
549
+
439
550
extension _StringGuts {
440
551
// Returns the stride of the grapheme cluster starting at offset `index`,
441
552
// assuming it is on a grapheme cluster boundary.
@@ -459,7 +570,7 @@ extension _StringGuts {
459
570
460
571
while true {
461
572
guard let ( scalar2, nextIndex) = nextScalar ( index) else { break }
462
- if shouldBreak ( between: scalar, and: scalar2, at : index , with : & state ) {
573
+ if state . shouldBreak ( between: scalar, and: scalar2) {
463
574
break
464
575
}
465
576
index = nextIndex
@@ -505,7 +616,7 @@ extension _StringGuts {
505
616
}
506
617
}
507
618
508
- extension _StringGuts {
619
+ extension _GraphemeBreakingState {
509
620
// Return true if there is an extended grapheme cluster boundary between two
510
621
// scalars, based on state information previously collected about preceding
511
622
// scalars.
@@ -517,11 +628,9 @@ extension _StringGuts {
517
628
//
518
629
// This is based on the Unicode Annex #29 for [Grapheme Cluster Boundary
519
630
// Rules](https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules).
520
- internal func shouldBreak(
631
+ internal mutating func shouldBreak(
521
632
between scalar1: Unicode . Scalar ,
522
- and scalar2: Unicode . Scalar ,
523
- at index: Int ,
524
- with state: inout _GraphemeBreakingState
633
+ and scalar2: Unicode . Scalar
525
634
) -> Bool {
526
635
// GB3
527
636
if scalar1. value == 0xD , scalar2. value == 0xA {
@@ -545,8 +654,8 @@ extension _StringGuts {
545
654
var enterIndicSequence = false
546
655
547
656
defer {
548
- state . isInEmojiSequence = enterEmojiSequence
549
- state . isInIndicSequence = enterIndicSequence
657
+ self . isInEmojiSequence = enterEmojiSequence
658
+ self . isInIndicSequence = enterIndicSequence
550
659
}
551
660
552
661
switch ( x, y) {
@@ -591,14 +700,14 @@ extension _StringGuts {
591
700
// continue the grapheme cluster by combining more scalars later. If we're
592
701
// not currently in an emoji sequence, but our lhs scalar is a pictograph,
593
702
// then that's a signal that it's the start of an emoji sequence.
594
- if state . isInEmojiSequence || x == . extendedPictographic {
703
+ if self . isInEmojiSequence || x == . extendedPictographic {
595
704
enterEmojiSequence = true
596
705
}
597
706
598
707
// If we're currently in an indic sequence (or if our lhs is a linking
599
708
// consonant), then this check and everything underneath ensures that
600
709
// we continue being in one and may check if this extend is a Virama.
601
- if state . isInIndicSequence || scalar1. _isLinkingConsonant {
710
+ if self . isInIndicSequence || scalar1. _isLinkingConsonant {
602
711
if y == . extend {
603
712
let extendNormData = Unicode . _NormData ( scalar2, fastUpperbound: 0x300 )
604
713
@@ -611,7 +720,7 @@ extension _StringGuts {
611
720
enterIndicSequence = true
612
721
613
722
if scalar2. _isVirama {
614
- state . hasSeenVirama = true
723
+ self . hasSeenVirama = true
615
724
}
616
725
}
617
726
@@ -627,32 +736,34 @@ extension _StringGuts {
627
736
628
737
// GB11
629
738
case ( . zwj, . extendedPictographic) :
630
- return !state . isInEmojiSequence
739
+ return !self . isInEmojiSequence
631
740
632
741
// GB12 & GB13
633
742
case ( . regionalIndicator, . regionalIndicator) :
634
743
defer {
635
- state . shouldBreakRI. toggle ( )
744
+ self . shouldBreakRI. toggle ( )
636
745
}
637
746
638
- return state . shouldBreakRI
747
+ return self . shouldBreakRI
639
748
640
749
// GB999
641
750
default :
642
751
// GB9c
643
752
if
644
- state . isInIndicSequence,
645
- state . hasSeenVirama,
753
+ self . isInIndicSequence,
754
+ self . hasSeenVirama,
646
755
scalar2. _isLinkingConsonant
647
756
{
648
- state . hasSeenVirama = false
757
+ self . hasSeenVirama = false
649
758
return false
650
759
}
651
760
652
761
return true
653
762
}
654
763
}
764
+ }
655
765
766
+ extension _StringGuts {
656
767
// Return true if there is an extended grapheme cluster boundary between two
657
768
// scalars, with no previous knowledge about preceding scalars.
658
769
//
0 commit comments