diff --git a/library/alloc/src/lib.rs b/library/alloc/src/lib.rs index c08caa7b93ed2..59b45b33fba9b 100644 --- a/library/alloc/src/lib.rs +++ b/library/alloc/src/lib.rs @@ -144,6 +144,7 @@ #![feature(unicode_internals)] #![feature(unsize)] #![feature(std_internals)] +#![feature(unicode_converter)] // // Language features: #![feature(allocator_internals)] diff --git a/library/alloc/tests/lib.rs b/library/alloc/tests/lib.rs index 367cdcdcc061c..f102d0cf9e1ad 100644 --- a/library/alloc/tests/lib.rs +++ b/library/alloc/tests/lib.rs @@ -44,6 +44,7 @@ #![feature(bench_black_box)] #![feature(strict_provenance)] #![feature(once_cell)] +#![feature(unicode_converter)] use std::collections::hash_map::DefaultHasher; use std::hash::{Hash, Hasher}; diff --git a/library/alloc/tests/str.rs b/library/alloc/tests/str.rs index 7379569dd68fe..f3c6f9030a2a2 100644 --- a/library/alloc/tests/str.rs +++ b/library/alloc/tests/str.rs @@ -1794,6 +1794,108 @@ fn to_uppercase() { assert_eq!("aéDžßfiᾀ".to_uppercase(), "AÉDŽSSFIἈΙ"); } +const CHARS_CONVERT_STRINGS: &[&str; 7] = &[ + "aBcD", + "ὀδυσσεύς", + "ὈΔΥΣΣΕΎΣ", + "aößü💩στιγμαςDžfiᾀ", + "AÖßÜ💩ΣΤΙΓΜΑΣDžfiİ", + "İİİİİİİİİİİİİİİİİİİİİİİİ", + "i̇i̇i̇i̇i̇i̇i̇i̇i̇i̇i̇i̇i̇i̇i̇i̇i̇i̇i̇i̇i̇i̇i̇i̇", +]; + +// Run n times .next() then .next_back() +fn chars_fwdback>(mut iter: I, n: usize) -> String { + let mut buf1 = String::new(); + let mut buf2 = String::new(); + for _ in 0..n { + if let Some(c) = iter.next() { + buf1.push(c); + } else { + break; + } + } + while let Some(c) = iter.next_back() { + buf2.push(c); + } + for c in buf2.chars().rev() { + buf1.push(c); + } + buf1 +} + +// Run n times .next_back() then .next() +fn chars_backfwd>(mut iter: I, n: usize) -> String { + let mut buf1 = String::new(); + let mut buf2 = String::new(); + for _ in 0..n { + if let Some(c) = iter.next_back() { + buf2.push(c); + } else { + break; + } + } + while let Some(c) = iter.next() { + buf1.push(c); + } + for c in buf2.chars().rev() { + buf1.push(c); + } + buf1 +} 
+ +#[test] +fn test_chars_uppercase() { + for s in CHARS_CONVERT_STRINGS { + let exp = s.to_uppercase(); + assert_eq!(s.chars_uppercase().collect::(), exp); + for i in 0..s.len() { + assert_eq!((i, &chars_fwdback(s.chars_uppercase(), i)), (i, &exp)); + assert_eq!((i, &chars_backfwd(s.chars_uppercase(), i)), (i, &exp)); + } + } +} + +#[test] +fn test_chars_lowercase() { + for s in CHARS_CONVERT_STRINGS { + let exp = s.to_lowercase(); + assert_eq!(s.chars_lowercase().collect::(), exp); + for i in 0..s.len() { + assert_eq!((i, &chars_fwdback(s.chars_lowercase(), i)), (i, &exp)); + assert_eq!((i, &chars_backfwd(s.chars_lowercase(), i)), (i, &exp)); + } + } +} + +#[test] +fn test_chars_uppercase_clone_debug() { + let mut iter = "abc".chars_uppercase(); + assert_eq!(iter.next(), Some('A')); + assert_eq!(iter.next(), Some('B')); + let mut iterc = iter.clone(); + assert_eq!(&format!("{:?}", &iterc), "CharsUppercase(['C'])"); + assert_eq!(iter.next(), Some('C')); + assert_eq!(iter.next(), None); + assert_eq!(iterc.clone().last(), Some('C')); + assert_eq!(iterc.next(), Some('C')); + assert_eq!(iterc.next(), None); +} + +#[test] +fn test_chars_lowercase_clone_debug() { + let mut iter = "ABC".chars_lowercase(); + assert_eq!(iter.next(), Some('a')); + assert_eq!(iter.next(), Some('b')); + let mut iterc = iter.clone(); + assert_eq!(&format!("{:?}", &iterc), "CharsLowercase(['c'])"); + assert_eq!(iter.next(), Some('c')); + assert_eq!(iter.next(), None); + assert_eq!(iterc.clone().last(), Some('c')); + assert_eq!(iterc.next(), Some('c')); + assert_eq!(iterc.next(), None); +} + #[test] fn test_into_string() { // The only way to acquire a Box in the first place is through a String, so just diff --git a/library/core/src/lib.rs b/library/core/src/lib.rs index 093c7d298734a..6f8fe37fd38dc 100644 --- a/library/core/src/lib.rs +++ b/library/core/src/lib.rs @@ -156,6 +156,7 @@ #![feature(const_slice_from_ref)] #![feature(const_slice_index)] #![feature(const_is_char_boundary)] 
+#![feature(unicode_converter)] // // Language features: #![feature(abi_unadjusted)] diff --git a/library/core/src/str/iter.rs b/library/core/src/str/iter.rs index 24083ee6af44f..a0f5f32c40924 100644 --- a/library/core/src/str/iter.rs +++ b/library/core/src/str/iter.rs @@ -8,6 +8,7 @@ use crate::iter::{TrustedRandomAccess, TrustedRandomAccessNoCoerce}; use crate::ops::Try; use crate::option; use crate::slice::{self, Split as SliceSplit}; +use crate::unicode::conversions; use super::from_utf8_unchecked; use super::pattern::Pattern; @@ -1497,3 +1498,346 @@ macro_rules! escape_types_impls { } escape_types_impls!(EscapeDebug, EscapeDefault, EscapeUnicode); + +/// Internal trait for Unicode conversions that potentially +/// need context and can expand one char to several. +/// +/// Only `is_simple` has a default implementation: it treats ASCII chars +/// as simple (`is_simple = is_ascii`); implementors supply both conversions. +trait UnicodeConverter { + /// Convert without context or expansion. + fn convert_simple(&self, c: char) -> char; + + /// Convert with context, char can expand. + fn convert_complex(&self, c: char, from_str: &str, from_idx: usize) -> [char; 3]; + + /// Can conversion be done with `convert_simple` that does not expand? + /// + /// If true, use `convert_simple`, otherwise `convert_complex`. + #[inline] + fn is_simple(&self, c: char) -> bool { + c.is_ascii() + } +} + +/// Convert chars to upper case. +#[derive(Clone, Debug)] +struct UppercaseConverter; + +impl UnicodeConverter for UppercaseConverter { + #[inline] + fn convert_simple(&self, c: char) -> char { + c.to_ascii_uppercase() + } + + #[inline] + fn convert_complex(&self, c: char, _from_str: &str, _idx: usize) -> [char; 3] { + conversions::to_upper(c) + } +} + +/// Convert chars to lower case. 
+#[derive(Clone, Debug)] +struct LowercaseConverter; + +impl UnicodeConverter for LowercaseConverter { + #[inline] + fn convert_simple(&self, c: char) -> char { + c.to_ascii_lowercase() + } + + #[inline] + fn convert_complex(&self, c: char, from_str: &str, idx: usize) -> [char; 3] { + // Σ maps to σ, except at the end of a word where it maps to ς. + // This is the only conditional (contextual) but language-independent mapping + // in `SpecialCasing.txt`, + // so hard-code it rather than have a generic "condition" mechanism. + // See https://github.com/rust-lang/rust/issues/26035 + if c == 'Σ' { map_uppercase_sigma(from_str, idx) } else { conversions::to_lower(c) } + } +} + +#[cold] +#[inline(never)] +fn map_uppercase_sigma(from: &str, i: usize) -> [char; 3] { + // See https://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G33992 + // for the definition of `Final_Sigma`. + debug_assert!('Σ'.len_utf8() == 2); + let is_word_final = case_ignoreable_then_cased(from[..i].chars().rev()) + && !case_ignoreable_then_cased(from[i + 2..].chars()); + if is_word_final { ['ς', '\0', '\0'] } else { ['σ', '\0', '\0'] } +} + +fn case_ignoreable_then_cased>(iter: I) -> bool { + use crate::unicode::{Case_Ignorable, Cased}; + match iter.skip_while(|&c| Case_Ignorable(c)).next() { + Some(c) => Cased(c), + None => false, + } +} + +/// Iterator that uses UnicodeConverter to convert chars. +/// +/// Allows expansion and can provide full context. +/// +/// Needs internal 2-char buffer in both forward and backward direction +/// to cache expanded chars. 
+#[derive(Clone)] +struct UnicodeIterator<'a, C> +where + C: UnicodeConverter, +{ + // data source + iter: CharIndices<'a>, + // buffer for .next() + fwd: [Option; 2], + // buffer for .next_back(), has reverse order + bwd: [Option; 2], + // keep original str for full context + orig_str: &'a str, + // state for converter + converter: C, +} + +impl<'a, C> UnicodeIterator<'a, C> +where + C: UnicodeConverter, +{ + fn new(s: &'a str, converter: C) -> Self { + UnicodeIterator { + iter: s.char_indices(), + fwd: [None, None], + bwd: [None, None], + orig_str: s, + converter, + } + } +} + +impl Iterator for UnicodeIterator<'_, C> +where + C: UnicodeConverter, +{ + type Item = char; + + fn next(&mut self) -> Option { + match self.fwd[0] { + Some(c) => { + self.fwd = [self.fwd[1], None]; + Some(c) + } + None => match self.iter.next() { + Some((idx, c)) => { + if self.converter.is_simple(c) { + Some(self.converter.convert_simple(c)) + } else { + match self.converter.convert_complex(c, self.orig_str, idx) { + [a, '\0', _] => Some(a), + [a, b, '\0'] => { + self.fwd[0] = Some(b); + Some(a) + } + [a, b, c] => { + self.fwd = [Some(b), Some(c)]; + Some(a) + } + } + } + } + None => match self.bwd { + [None, _] => None, + [Some(a), None] => { + self.bwd[0] = None; + Some(a) + } + [_, Some(b)] => { + self.bwd[1] = None; + Some(b) + } + }, + }, + } + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let (low, high) = self.iter.size_hint(); + (low, high.and_then(|n| n.checked_mul(3)).and_then(|n| n.checked_add(4))) + } + + #[inline] + fn last(mut self) -> Option { + self.next_back() + } +} + +impl DoubleEndedIterator for UnicodeIterator<'_, C> +where + C: UnicodeConverter, +{ + fn next_back(&mut self) -> Option { + match self.bwd[0] { + Some(c) => { + self.bwd = [self.bwd[1], None]; + Some(c) + } + None => match self.iter.next_back() { + Some((idx, c)) => { + if self.converter.is_simple(c) { + Some(self.converter.convert_simple(c)) + } else { + match 
self.converter.convert_complex(c, self.orig_str, idx) { + [a, '\0', _] => Some(a), + [a, b, '\0'] => { + self.bwd[0] = Some(a); + Some(b) + } + [a, b, c] => { + self.bwd = [Some(b), Some(a)]; + Some(c) + } + } + } + } + None => match self.fwd { + [None, _] => None, + [Some(a), None] => { + self.fwd[0] = None; + Some(a) + } + [_, Some(b)] => { + self.fwd[1] = None; + Some(b) + } + }, + }, + } + } +} + +impl fmt::Debug for UnicodeIterator<'_, C> +where + C: UnicodeConverter + Clone, +{ + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_list().entries(self.clone()).finish() + } +} + +impl FusedIterator for UnicodeIterator<'_, C> where C: UnicodeConverter {} + +/// An iterator over the uppercase chars of a string slice. +/// +/// This struct is created by the [`chars_uppercase`] method on [`str`]. +/// See its documentation for more information. +/// +/// [`chars_uppercase`]: str::chars_uppercase +#[unstable(feature = "unicode_converter", issue = "none")] +#[must_use = "iterators are lazy and do nothing unless consumed"] +#[derive(Clone)] +pub struct CharsUppercase<'a> { + inner: UnicodeIterator<'a, UppercaseConverter>, +} + +impl<'a> CharsUppercase<'a> { + #[inline] + pub(super) fn new(val: &'a str) -> CharsUppercase<'a> { + Self { inner: UnicodeIterator::new(val, UppercaseConverter) } + } +} + +#[unstable(feature = "unicode_converter", issue = "none")] +impl<'a> Iterator for CharsUppercase<'a> { + type Item = char; + + #[inline] + fn next(&mut self) -> Option { + self.inner.next() + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } + + #[inline] + fn last(self) -> Option { + self.inner.last() + } +} + +#[unstable(feature = "unicode_converter", issue = "none")] +impl DoubleEndedIterator for CharsUppercase<'_> { + #[inline] + fn next_back(&mut self) -> Option { + self.inner.next_back() + } +} + +#[unstable(feature = "unicode_converter", issue = "none")] +impl FusedIterator for CharsUppercase<'_> {} + 
+#[unstable(feature = "unicode_converter", issue = "none")] +impl fmt::Debug for CharsUppercase<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "CharsUppercase({:?})", &self.inner) + } +} + +/// An iterator over the lowercase chars of a string slice. +/// +/// This struct is created by the [`chars_lowercase`] method on [`str`]. +/// See its documentation for more information. +/// +/// [`chars_lowercase`]: str::chars_lowercase +#[unstable(feature = "unicode_converter", issue = "none")] +#[must_use = "iterators are lazy and do nothing unless consumed"] +#[derive(Clone)] +pub struct CharsLowercase<'a> { + inner: UnicodeIterator<'a, LowercaseConverter>, +} + +impl<'a> CharsLowercase<'a> { + #[inline] + pub(super) fn new(val: &'a str) -> CharsLowercase<'a> { + CharsLowercase { inner: UnicodeIterator::new(val, LowercaseConverter) } + } +} + +#[unstable(feature = "unicode_converter", issue = "none")] +impl<'a> Iterator for CharsLowercase<'a> { + type Item = char; + + #[inline] + fn next(&mut self) -> Option { + self.inner.next() + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } + + #[inline] + fn last(self) -> Option { + self.inner.last() + } +} + +#[unstable(feature = "unicode_converter", issue = "none")] +impl DoubleEndedIterator for CharsLowercase<'_> { + #[inline] + fn next_back(&mut self) -> Option { + self.inner.next_back() + } +} + +#[unstable(feature = "unicode_converter", issue = "none")] +impl FusedIterator for CharsLowercase<'_> {} + +#[unstable(feature = "unicode_converter", issue = "none")] +impl fmt::Debug for CharsLowercase<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "CharsLowercase({:?})", &self.inner) + } +} diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index c4f2e283eb3bc..0865099c806a3 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -69,6 +69,9 @@ pub use iter::SplitAsciiWhitespace; 
#[stable(feature = "split_inclusive", since = "1.51.0")] pub use iter::SplitInclusive; +#[unstable(feature = "unicode_converter", issue = "none")] +pub use iter::{CharsLowercase, CharsUppercase}; + #[unstable(feature = "str_internals", issue = "none")] pub use validations::{next_code_point, utf8_char_width}; @@ -838,6 +841,102 @@ impl str { CharIndices { front_offset: 0, iter: self.chars() } } + /// Returns an iterator over the [`char`]s of a string slice, + /// converted to uppercase. + /// + /// It is guaranteed to match the output of [`str::to_uppercase`] + /// as it uses the same context-aware conversion method. + /// + /// It is not guaranteed to match [`char::to_uppercase`] + /// as that API is not context-aware. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// #![feature(unicode_converter)] + /// + /// let input = "abc"; + /// + /// let mut iter = input.chars_uppercase(); + /// + /// assert_eq!(Some('A'), iter.next()); + /// assert_eq!(Some('B'), iter.next()); + /// assert_eq!(Some('C'), iter.next()); + /// assert_eq!(None, iter.next()); + /// ``` + /// + /// Complex case: + /// + /// ``` + /// #![feature(unicode_converter)] + /// + /// let input = "ὒ"; + /// + /// let mut iter = input.chars_uppercase(); + /// + /// assert_eq!(Some('Υ'), iter.next()); + /// assert_eq!(Some('\u{300}'), iter.next_back()); + /// assert_eq!(Some('\u{313}'), iter.next()); + /// assert_eq!(None, iter.next_back()); + /// ``` + /// + /// [`char`]: prim@char + /// [`str::to_uppercase`]: https://doc.rust-lang.org/std/primitive.str.html#method.to_uppercase + #[unstable(feature = "unicode_converter", issue = "none")] + #[inline] + pub fn chars_uppercase(&self) -> CharsUppercase<'_> { + CharsUppercase::new(self) + } + + /// Returns an iterator over the [`char`]s of a string slice, + /// converted to lowercase. + /// + /// It is guaranteed to match the output of [`str::to_lowercase`] + /// as it uses the same context-aware conversion method. 
+ /// + /// It is not guaranteed to match [`char::to_lowercase`] + /// as that API is not context-aware. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// #![feature(unicode_converter)] + /// + /// let input = "ABC"; + /// + /// let mut iter = input.chars_lowercase(); + /// + /// assert_eq!(Some('a'), iter.next()); + /// assert_eq!(Some('b'), iter.next()); + /// assert_eq!(Some('c'), iter.next()); + /// assert_eq!(None, iter.next()); + /// ``` + /// + /// Complex situation: + /// + /// ``` + /// #![feature(unicode_converter)] + /// + /// let input = "ὈΔΥΣΣΕΎΣ"; + /// + /// let mut iter = input.chars_lowercase(); + /// + /// assert_eq!(Some('ς'), iter.next_back()); + /// assert_eq!(Some('σ'), input.chars().flat_map(|c| c.to_lowercase()).last()) + /// ``` + /// + /// [`char`]: prim@char + /// [`str::to_lowercase`]: https://doc.rust-lang.org/std/primitive.str.html#method.to_lowercase + #[unstable(feature = "unicode_converter", issue = "none")] + #[inline] + pub fn chars_lowercase(&self) -> CharsLowercase<'_> { + CharsLowercase::new(self) + } + /// An iterator over the bytes of a string slice. 
/// /// As a string slice consists of a sequence of bytes, we can iterate diff --git a/library/std/src/lib.rs b/library/std/src/lib.rs index 7da9f248c877a..ff2c0bba3e70e 100644 --- a/library/std/src/lib.rs +++ b/library/std/src/lib.rs @@ -291,6 +291,7 @@ #![feature(std_internals)] #![feature(str_internals)] #![feature(strict_provenance)] +#![feature(unicode_converter)] // // Library features (alloc): #![feature(alloc_layout_extra)] diff --git a/src/doc/unstable-book/src/library-features/unicode-converter.md b/src/doc/unstable-book/src/library-features/unicode-converter.md new file mode 100644 index 0000000000000..95347716d02b6 --- /dev/null +++ b/src/doc/unstable-book/src/library-features/unicode-converter.md @@ -0,0 +1,9 @@ +# `unicode_converter` + +No tracking issue + +------------------------ + +Adds the `str::chars_uppercase` and `str::chars_lowercase` iterators. + +They are based on an internal iterator built around a context-aware Unicode converter API.