diff --git a/CHANGELOG.md b/CHANGELOG.md index bb1917575e37..39ef4a3ec010 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2663,6 +2663,7 @@ Released 2018-09-13 [`missing_safety_doc`]: https://rust-lang.github.io/rust-clippy/master/index.html#missing_safety_doc [`mistyped_literal_suffixes`]: https://rust-lang.github.io/rust-clippy/master/index.html#mistyped_literal_suffixes [`mixed_case_hex_literals`]: https://rust-lang.github.io/rust-clippy/master/index.html#mixed_case_hex_literals +[`mixed_locale_idents`]: https://rust-lang.github.io/rust-clippy/master/index.html#mixed_locale_idents [`module_inception`]: https://rust-lang.github.io/rust-clippy/master/index.html#module_inception [`module_name_repetitions`]: https://rust-lang.github.io/rust-clippy/master/index.html#module_name_repetitions [`modulo_arithmetic`]: https://rust-lang.github.io/rust-clippy/master/index.html#modulo_arithmetic diff --git a/clippy_lints/Cargo.toml b/clippy_lints/Cargo.toml index d3d12062f077..df456aec4b06 100644 --- a/clippy_lints/Cargo.toml +++ b/clippy_lints/Cargo.toml @@ -23,6 +23,8 @@ serde = { version = "1.0", features = ["derive"] } serde_json = { version = "1.0", optional = true } toml = "0.5.3" unicode-normalization = "0.1" +unicode-script = { version = "0.5.1", default-features = false } +unicode-security = { version = "0.0.5", default-features = false } semver = "0.11" rustc-semver = "1.1.0" # NOTE: cargo requires serde feat in its url dep diff --git a/clippy_lints/src/lib.rs b/clippy_lints/src/lib.rs index f5082468a77c..14ebe7b47325 100644 --- a/clippy_lints/src/lib.rs +++ b/clippy_lints/src/lib.rs @@ -268,6 +268,7 @@ mod misc_early; mod missing_const_for_fn; mod missing_doc; mod missing_inline; +mod mixed_locale_idents; mod modulo_arithmetic; mod multiple_crate_versions; mod mut_key; @@ -813,6 +814,7 @@ pub fn register_plugins(store: &mut rustc_lint::LintStore, sess: &Session, conf: missing_const_for_fn::MISSING_CONST_FOR_FN, missing_doc::MISSING_DOCS_IN_PRIVATE_ITEMS, missing_inline::MISSING_INLINE_IN_PUBLIC_ITEMS, + mixed_locale_idents::MIXED_LOCALE_IDENTS, modulo_arithmetic::MODULO_ARITHMETIC, multiple_crate_versions::MULTIPLE_CRATE_VERSIONS, mut_key::MUTABLE_KEY_TYPE, @@ -1341,6 +1343,7 @@ pub fn register_plugins(store: &mut rustc_lint::LintStore, sess: &Session, conf: LintId::of(misc_early::REDUNDANT_PATTERN), LintId::of(misc_early::UNNEEDED_WILDCARD_PATTERN), LintId::of(misc_early::ZERO_PREFIXED_LITERAL), + LintId::of(mixed_locale_idents::MIXED_LOCALE_IDENTS), LintId::of(mut_key::MUTABLE_KEY_TYPE), LintId::of(mut_mutex_lock::MUT_MUTEX_LOCK), LintId::of(mut_reference::UNNECESSARY_MUT_PASSED), @@ -1528,6 +1531,7 @@ pub fn register_plugins(store: &mut rustc_lint::LintStore, sess: &Session, conf: LintId::of(misc_early::DUPLICATE_UNDERSCORE_ARGUMENT), LintId::of(misc_early::MIXED_CASE_HEX_LITERALS), LintId::of(misc_early::REDUNDANT_PATTERN), + LintId::of(mixed_locale_idents::MIXED_LOCALE_IDENTS), LintId::of(mut_mutex_lock::MUT_MUTEX_LOCK), LintId::of(mut_reference::UNNECESSARY_MUT_PASSED), LintId::of(needless_borrow::NEEDLESS_BORROW), @@ -1924,6 +1928,7 @@ pub fn register_plugins(store: &mut rustc_lint::LintStore, sess: &Session, conf: store.register_late_pass(|| box eval_order_dependence::EvalOrderDependence); store.register_late_pass(|| box missing_doc::MissingDoc::new()); store.register_late_pass(|| box missing_inline::MissingInline); + store.register_late_pass(|| box mixed_locale_idents::MixedLocaleIdents); store.register_late_pass(move || box exhaustive_items::ExhaustiveItems); store.register_late_pass(|| box if_let_some_result::OkIfLet); store.register_late_pass(|| box partialeq_ne_impl::PartialEqNeImpl); diff --git a/clippy_lints/src/mixed_locale_idents.rs b/clippy_lints/src/mixed_locale_idents.rs new file mode 100644 index 000000000000..083f75caa4fe --- /dev/null +++ b/clippy_lints/src/mixed_locale_idents.rs @@ -0,0 +1,231 @@ +use clippy_utils::diagnostics::span_lint; +use rustc_data_structures::fx::FxHashMap; +use rustc_lint::{LateContext, LateLintPass}; +use rustc_session::{declare_tool_lint, impl_lint_pass}; +use rustc_span::{Span, Symbol}; +use unicode_script::{Script, UnicodeScript}; +use unicode_security::is_potential_mixed_script_confusable_char; + +declare_clippy_lint! { + /// **What it does:** Checks for usage of mixed locales in a single identifier part + /// with respect to the used case. + /// + /// **Why is this bad?** While it's OK for identifier to have different parts in different + /// locales, it may be confusing if multiple locales are used in a single word. + /// + /// **Known problems:** None. + /// + /// **Example:** + /// + /// The following code compiles without any warnings: + /// + /// ```rust + /// // In the `Block` identifier part, it's not a common latin `o` used, but rather Russian `о`. + /// // It's hard to see visually and the `mixed_script_confusables` warning is not spawned + /// // because of the `Блок` identifier part. + /// struct BlоckБлок; + /// + /// fn main() { + /// let _block = BlоckБлок; + /// } + /// ``` + /// + /// The same example, but with `Block` (english `o`) used instead of `Blоck` (russian `о`). + /// It will not compile. + /// + /// ```compile_fail + /// struct BlоckБлок; + /// + /// fn main() { + /// let _block = BlockБлок; + /// } + /// + /// // Compile output: + /// // + /// // error[E0425]: cannot find value `BlockБлок` in this scope + /// // --> src/main.rs:4:18 + /// // | + /// // 1 | struct BlоckБлок; + /// // | ----------------- similarly named unit struct `BlоckБлок` defined here + /// // ... + /// // 4 | let _block = BlockБлок; + /// // | ^^^^^^^^^ help: a unit struct with a similar name exists: `BlоckБлок` + /// ``` + pub MIXED_LOCALE_IDENTS, + style, + "multiple locales used in a single identifier part" +} + +#[derive(Clone, Debug)] +pub struct MixedLocaleIdents; + +impl_lint_pass!(MixedLocaleIdents => [MIXED_LOCALE_IDENTS]); + +/// Cases that normally can be met in the Rust identifiers. +#[derive(Debug)] +enum Case { + /// E.g. `SomeStruct`, delimiter is uppercase letter. + Camel, + /// E.g. `some_var` or `SOME_VAR, delimiter is `_`. + Snake, + /// E.g. `WHY_AmIWritingThisWay`, delimiters are both `_` and uppercase letter. + Mixed, +} + +fn detect_case(ident: &str) -> Case { + let mut has_underscore = false; + let mut has_upper = false; + let mut has_lower = false; + for c in ident.chars() { + has_underscore |= c == '_'; + has_upper |= c.is_uppercase(); + has_lower |= c.is_lowercase(); + } + + if has_upper && has_lower && has_underscore { + Case::Mixed + } else if !has_upper || !has_lower { + // Because of the statement above we are sure + // that we can't have mixed case already. + // If we have only one case, it's certainly snake case (either lowercase or uppercase). + Case::Snake + } else { + // The only option left. + Case::Camel + } +} + +/// Flag for having confusables in the identifier. +#[derive(Debug, Clone, Copy)] +enum ConfusablesState { + /// Identifier part currently only has confusables. + OnlyConfusables, + /// Identifier part has not only confusables. + NotConfusable, +} + +impl ConfusablesState { + fn from_char(c: char) -> Self { + if Self::is_confusable(c) { + Self::OnlyConfusables + } else { + Self::NotConfusable + } + } + + fn update(&mut self, c: char) { + let is_confusable = Self::is_confusable(c); + *self = match (*self, is_confusable) { + (Self::OnlyConfusables, true) => Self::OnlyConfusables, + _ => Self::NotConfusable, + }; + } + + fn is_confusable(c: char) -> bool { + match c.script() { + Script::Common | Script::Unknown | Script::Latin => { + // `Common` and `Unkown` are not categories we're interested in. + // `Latin` is the primary locale, we don't consider it to be confusable. + false + }, + _ => { + // Otherwise, actually check the symbol. + is_potential_mixed_script_confusable_char(c) + }, + } + } +} + +fn warning_required(locales: &FxHashMap) -> bool { + // If there is a single locale, we don't need a warning (e.g. `око` is a valid Russian + // word that consists of the consufables only). + if locales.len() <= 1 { + return false; + } + + // Otherwise, check whether at least one of locales consists of confusables only. + locales + .iter() + .any(|(_loc, state)| matches!(state, ConfusablesState::OnlyConfusables)) +} + +impl<'tcx> LateLintPass<'tcx> for MixedLocaleIdents { + fn check_name(&mut self, cx: &LateContext<'tcx>, span: Span, ident: Symbol) { + let ident_name = ident.to_string(); + + // First pass just to check whether identifier is fully ASCII, + // without any expensive actions. + // Most of identifiers are *expected* to be ASCII to it's better + // to return early if possible. + if ident_name.is_ascii() { + return; + } + + // Iterate through identifier with respect to its case + // and checks whether it contains mixed locales in a single identifier. + let ident_case = detect_case(&ident_name); + // Positions of the identifier part being checked. + // Used in report to render only relevant part. + let (mut ident_part_start, mut ident_part_end): (usize, usize) = (0, 0); + // List of locales used in the *identifier part* + let mut used_locales: FxHashMap = FxHashMap::default(); + for (id, symbol) in ident_name.chars().enumerate() { + // Check whether we've reached the next identifier part. + let is_next_identifier = match ident_case { + Case::Camel => symbol.is_uppercase(), + Case::Snake => symbol == '_', + Case::Mixed => symbol.is_uppercase() || symbol == '_', + }; + + // If the new identifier started, we should not include its part + // into the report. + if !is_next_identifier { + ident_part_end = id; + } + + if is_next_identifier { + if warning_required(&used_locales) { + // We've found the part that has multiple locales, + // no further analysis is required. + break; + } + // Clear everything and start checking the next identifier. + ident_part_start = id; + used_locales.clear(); + } + + let script = symbol.script(); + if script != Script::Common && script != Script::Unknown { + used_locales + .entry(script) + .and_modify(|e| e.update(symbol)) + .or_insert_with(|| ConfusablesState::from_char(symbol)); + } + } + + if warning_required(&used_locales) { + let ident_len = ident_part_end - ident_part_start + 1; + let ident_part: String = ident_name.chars().skip(ident_part_start).take(ident_len).collect(); + let locales: Vec<&'static str> = used_locales + .iter() + .filter_map(|(loc, state)| match state { + ConfusablesState::OnlyConfusables => Some(loc.full_name()), + ConfusablesState::NotConfusable => None, + }) + .collect(); + + assert!( + !locales.is_empty(), + "Warning is required; having no locales to report is a bug" + ); + + let message = format!( + "identifier part `{}` contains confusables-only symbols from the following locales: {}", + ident_part.trim_start_matches('_'), + locales.join(", "), + ); + + span_lint(cx, MIXED_LOCALE_IDENTS, span, &message); + } + } +} diff --git a/tests/ui/enum_variants.rs b/tests/ui/enum_variants.rs index 083f5143e6e4..ef6189d7a483 100644 --- a/tests/ui/enum_variants.rs +++ b/tests/ui/enum_variants.rs @@ -1,5 +1,5 @@ #![warn(clippy::enum_variant_names)] -#![allow(non_camel_case_types, clippy::upper_case_acronyms)] +#![allow(non_camel_case_types, clippy::upper_case_acronyms, clippy::mixed_locale_idents)] enum FakeCallType { CALL, diff --git a/tests/ui/mixed_locale_idents.rs b/tests/ui/mixed_locale_idents.rs new file mode 100644 index 000000000000..4c89131a304a --- /dev/null +++ b/tests/ui/mixed_locale_idents.rs @@ -0,0 +1,81 @@ +#![warn(clippy::mixed_locale_idents)] +#![allow(dead_code, non_camel_case_types, confusable_idents, non_upper_case_globals)] + +mod should_spawn_warnings { + // In the examples, cyrillic `о` is used in `Blоck`. + + pub struct Blоck; + pub const BLОCK: u8 = 42; + pub fn blоck() { + let blоck = 42u8; + } + + // Identifiers that consist of multiple parts. + pub struct SomeBlоckIdentifier; + pub const SOME_BLОCK_IDENT: u8 = 42; + pub fn some_blоck_fn() { + let some_blоck_var = 42u8; + } + pub struct Some_BlоckIdent; // Mixed case + + // Identifiers that have multiple matches. + // Only the first match should be reported. + pub struct BlоckClоck; + pub const BLОCK_CLОCK: u8 = 42; + pub fn blоck_clоck() { + let blоck_clоck = 42u8; + } + + // Identifiers that have both latin & non-latin word, and + // mixed case word. + pub struct SomeБлокBlоck; + + // Identifier that has 3 locales, one of which is not confusable, and one is. + // It must not complain about Chinese (as it's not confusable), but report + // Cyrillic instead. + pub struct Blоck看; +} + +mod should_not_spawn_warnings { + // In all the examples, `блок` is fully cyrillic. + + pub struct TryБлок; + pub const TRY_БЛОК: u8 = 42; + pub fn try_блок() { + let try_блок_var = 42u8; + } + // Mixed case + pub struct Some_БлокIdent; + + // Using non-confusables in ident together with confusables. + fn fnъуъ() { + let try看 = 42u8; + } + + // Using only lating confusables (`o` is latin). + fn ooo_блок() {} + + // One-word non-latin identifiers that contain non-confusables. + fn блок() {} +} + +// Checks to see that some edge cases do not cause panics. +mod render_tests { + // One-letter Latin identifier. Should not trigger the warning. + struct O; + + // One-letter Cyrillic identifier. + struct О; + + // Multiple underscores (`O` is cyrillic; `o` is latin). + const __ZZZ___О__: u8 = 42; + + const __ZZZ___Оo__: u8 = 42; +} + +fn main() { + // Additional examples that should *not* spawn the warning. + + // Should not spawn the lint, as it represents valid sequence in a single locale. + let nutzer_zähler = "user counter"; +} diff --git a/tests/ui/mixed_locale_idents.stderr b/tests/ui/mixed_locale_idents.stderr new file mode 100644 index 000000000000..de99b7357273 --- /dev/null +++ b/tests/ui/mixed_locale_idents.stderr @@ -0,0 +1,100 @@ +error: identifier part `Blоck` contains confusables-only symbols from the following locales: Cyrillic + --> $DIR/mixed_locale_idents.rs:7:16 + | +LL | pub struct Blоck; + | ^^^^^ + | + = note: `-D clippy::mixed-locale-idents` implied by `-D warnings` + +error: identifier part `BLОCK` contains confusables-only symbols from the following locales: Cyrillic + --> $DIR/mixed_locale_idents.rs:8:15 + | +LL | pub const BLОCK: u8 = 42; + | ^^^^^ + +error: identifier part `blоck` contains confusables-only symbols from the following locales: Cyrillic + --> $DIR/mixed_locale_idents.rs:9:12 + | +LL | pub fn blоck() { + | ^^^^^ + +error: identifier part `blоck` contains confusables-only symbols from the following locales: Cyrillic + --> $DIR/mixed_locale_idents.rs:10:13 + | +LL | let blоck = 42u8; + | ^^^^^ + +error: identifier part `Blоck` contains confusables-only symbols from the following locales: Cyrillic + --> $DIR/mixed_locale_idents.rs:14:16 + | +LL | pub struct SomeBlоckIdentifier; + | ^^^^^^^^^^^^^^^^^^^ + +error: identifier part `BLОCK` contains confusables-only symbols from the following locales: Cyrillic + --> $DIR/mixed_locale_idents.rs:15:15 + | +LL | pub const SOME_BLОCK_IDENT: u8 = 42; + | ^^^^^^^^^^^^^^^^ + +error: identifier part `blоck` contains confusables-only symbols from the following locales: Cyrillic + --> $DIR/mixed_locale_idents.rs:16:12 + | +LL | pub fn some_blоck_fn() { + | ^^^^^^^^^^^^^ + +error: identifier part `blоck` contains confusables-only symbols from the following locales: Cyrillic + --> $DIR/mixed_locale_idents.rs:17:13 + | +LL | let some_blоck_var = 42u8; + | ^^^^^^^^^^^^^^ + +error: identifier part `Blоck` contains confusables-only symbols from the following locales: Cyrillic + --> $DIR/mixed_locale_idents.rs:19:16 + | +LL | pub struct Some_BlоckIdent; // Mixed case + | ^^^^^^^^^^^^^^^ + +error: identifier part `Blоck` contains confusables-only symbols from the following locales: Cyrillic + --> $DIR/mixed_locale_idents.rs:23:16 + | +LL | pub struct BlоckClоck; + | ^^^^^^^^^^ + +error: identifier part `BLОCK` contains confusables-only symbols from the following locales: Cyrillic + --> $DIR/mixed_locale_idents.rs:24:15 + | +LL | pub const BLОCK_CLОCK: u8 = 42; + | ^^^^^^^^^^^ + +error: identifier part `blоck` contains confusables-only symbols from the following locales: Cyrillic + --> $DIR/mixed_locale_idents.rs:25:12 + | +LL | pub fn blоck_clоck() { + | ^^^^^^^^^^^ + +error: identifier part `blоck` contains confusables-only symbols from the following locales: Cyrillic + --> $DIR/mixed_locale_idents.rs:26:13 + | +LL | let blоck_clоck = 42u8; + | ^^^^^^^^^^^ + +error: identifier part `Blоck` contains confusables-only symbols from the following locales: Cyrillic + --> $DIR/mixed_locale_idents.rs:31:16 + | +LL | pub struct SomeБлокBlоck; + | ^^^^^^^^^^^^^ + +error: identifier part `Blоck看` contains confusables-only symbols from the following locales: Cyrillic + --> $DIR/mixed_locale_idents.rs:36:16 + | +LL | pub struct Blоck看; + | ^^^^^^^ + +error: identifier part `Оo` contains confusables-only symbols from the following locales: Cyrillic + --> $DIR/mixed_locale_idents.rs:73:11 + | +LL | const __ZZZ___Оo__: u8 = 42; + | ^^^^^^^^^^^^ + +error: aborting due to 16 previous errors +