Skip to content

Add new lint: Mixed locale ident #7376

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 16 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2663,6 +2663,7 @@ Released 2018-09-13
[`missing_safety_doc`]: https://rust-lang.github.io/rust-clippy/master/index.html#missing_safety_doc
[`mistyped_literal_suffixes`]: https://rust-lang.github.io/rust-clippy/master/index.html#mistyped_literal_suffixes
[`mixed_case_hex_literals`]: https://rust-lang.github.io/rust-clippy/master/index.html#mixed_case_hex_literals
[`mixed_locale_idents`]: https://rust-lang.github.io/rust-clippy/master/index.html#mixed_locale_idents
[`module_inception`]: https://rust-lang.github.io/rust-clippy/master/index.html#module_inception
[`module_name_repetitions`]: https://rust-lang.github.io/rust-clippy/master/index.html#module_name_repetitions
[`modulo_arithmetic`]: https://rust-lang.github.io/rust-clippy/master/index.html#modulo_arithmetic
Expand Down
2 changes: 2 additions & 0 deletions clippy_lints/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ serde = { version = "1.0", features = ["derive"] }
serde_json = { version = "1.0", optional = true }
toml = "0.5.3"
unicode-normalization = "0.1"
unicode-script = { version = "0.5.1", default-features = false }
unicode-security = { version = "0.0.5", default-features = false }
semver = "0.11"
rustc-semver = "1.1.0"
# NOTE: cargo requires serde feat in its url dep
Expand Down
5 changes: 5 additions & 0 deletions clippy_lints/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,7 @@ mod misc_early;
mod missing_const_for_fn;
mod missing_doc;
mod missing_inline;
mod mixed_locale_idents;
mod modulo_arithmetic;
mod multiple_crate_versions;
mod mut_key;
Expand Down Expand Up @@ -813,6 +814,7 @@ pub fn register_plugins(store: &mut rustc_lint::LintStore, sess: &Session, conf:
missing_const_for_fn::MISSING_CONST_FOR_FN,
missing_doc::MISSING_DOCS_IN_PRIVATE_ITEMS,
missing_inline::MISSING_INLINE_IN_PUBLIC_ITEMS,
mixed_locale_idents::MIXED_LOCALE_IDENTS,
modulo_arithmetic::MODULO_ARITHMETIC,
multiple_crate_versions::MULTIPLE_CRATE_VERSIONS,
mut_key::MUTABLE_KEY_TYPE,
Expand Down Expand Up @@ -1341,6 +1343,7 @@ pub fn register_plugins(store: &mut rustc_lint::LintStore, sess: &Session, conf:
LintId::of(misc_early::REDUNDANT_PATTERN),
LintId::of(misc_early::UNNEEDED_WILDCARD_PATTERN),
LintId::of(misc_early::ZERO_PREFIXED_LITERAL),
LintId::of(mixed_locale_idents::MIXED_LOCALE_IDENTS),
LintId::of(mut_key::MUTABLE_KEY_TYPE),
LintId::of(mut_mutex_lock::MUT_MUTEX_LOCK),
LintId::of(mut_reference::UNNECESSARY_MUT_PASSED),
Expand Down Expand Up @@ -1528,6 +1531,7 @@ pub fn register_plugins(store: &mut rustc_lint::LintStore, sess: &Session, conf:
LintId::of(misc_early::DUPLICATE_UNDERSCORE_ARGUMENT),
LintId::of(misc_early::MIXED_CASE_HEX_LITERALS),
LintId::of(misc_early::REDUNDANT_PATTERN),
LintId::of(mixed_locale_idents::MIXED_LOCALE_IDENTS),
LintId::of(mut_mutex_lock::MUT_MUTEX_LOCK),
LintId::of(mut_reference::UNNECESSARY_MUT_PASSED),
LintId::of(needless_borrow::NEEDLESS_BORROW),
Expand Down Expand Up @@ -1924,6 +1928,7 @@ pub fn register_plugins(store: &mut rustc_lint::LintStore, sess: &Session, conf:
store.register_late_pass(|| box eval_order_dependence::EvalOrderDependence);
store.register_late_pass(|| box missing_doc::MissingDoc::new());
store.register_late_pass(|| box missing_inline::MissingInline);
store.register_late_pass(|| box mixed_locale_idents::MixedLocaleIdents);
store.register_late_pass(move || box exhaustive_items::ExhaustiveItems);
store.register_late_pass(|| box if_let_some_result::OkIfLet);
store.register_late_pass(|| box partialeq_ne_impl::PartialEqNeImpl);
Expand Down
231 changes: 231 additions & 0 deletions clippy_lints/src/mixed_locale_idents.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
use clippy_utils::diagnostics::span_lint;
use rustc_data_structures::fx::FxHashMap;
use rustc_lint::{LateContext, LateLintPass};
use rustc_session::{declare_tool_lint, impl_lint_pass};
use rustc_span::{Span, Symbol};
use unicode_script::{Script, UnicodeScript};
use unicode_security::is_potential_mixed_script_confusable_char;

declare_clippy_lint! {
/// **What it does:** Checks for usage of mixed locales in a single identifier part
/// with respect to the used case.
///
/// **Why is this bad?** While it's OK for identifier to have different parts in different
/// locales, it may be confusing if multiple locales are used in a single word.
///
/// **Known problems:** None.
///
/// **Example:**
///
/// The following code compiles without any warnings:
///
/// ```rust
/// // In the `Block` identifier part, it's not a common latin `o` used, but rather Russian `о`.
/// // It's hard to see visually and the `mixed_script_confusables` warning is not spawned
/// // because of the `Блок` identifier part.
/// struct BlоckБлок;
///
/// fn main() {
/// let _block = BlоckБлок;
/// }
/// ```
///
/// The same example, but with `Block` (english `o`) used instead of `Blоck` (russian `о`).
/// It will not compile.
///
/// ```compile_fail
/// struct BlоckБлок;
///
/// fn main() {
/// let _block = BlockБлок;
/// }
///
/// // Compile output:
/// //
/// // error[E0425]: cannot find value `BlockБлок` in this scope
/// // --> src/main.rs:4:18
/// // |
/// // 1 | struct BlоckБлок;
/// // | ----------------- similarly named unit struct `BlоckБлок` defined here
/// // ...
/// // 4 | let _block = BlockБлок;
/// // | ^^^^^^^^^ help: a unit struct with a similar name exists: `BlоckБлок`
/// ```
pub MIXED_LOCALE_IDENTS,
style,
"multiple locales used in a single identifier part"
}

#[derive(Clone, Debug)]
pub struct MixedLocaleIdents;

impl_lint_pass!(MixedLocaleIdents => [MIXED_LOCALE_IDENTS]);

/// Cases that normally can be met in the Rust identifiers.
#[derive(Debug)]
enum Case {
/// E.g. `SomeStruct`, delimiter is uppercase letter.
Camel,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay so a problem with this is that not all writing systems have uppercase. More thought needs to be put into how that will work.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But perhaps using only mixed script confusables without underscores in a character is enough for us for now. Idk.

/// E.g. `some_var` or `SOME_VAR, delimiter is `_`.
Snake,
/// E.g. `WHY_AmIWritingThisWay`, delimiters are both `_` and uppercase letter.
Mixed,
}

fn detect_case(ident: &str) -> Case {
let mut has_underscore = false;
let mut has_upper = false;
let mut has_lower = false;
for c in ident.chars() {
has_underscore |= c == '_';
has_upper |= c.is_uppercase();
has_lower |= c.is_lowercase();
}

if has_upper && has_lower && has_underscore {
Case::Mixed
} else if !has_upper || !has_lower {
// Because of the statement above we are sure
// that we can't have mixed case already.
// If we have only one case, it's certainly snake case (either lowercase or uppercase).
Case::Snake
} else {
// The only option left.
Case::Camel
}
}

/// Flag for having confusables in the identifier.
#[derive(Debug, Clone, Copy)]
enum ConfusablesState {
/// Identifier part currently only has confusables.
OnlyConfusables,
/// Identifier part has not only confusables.
NotConfusable,
}

impl ConfusablesState {
fn from_char(c: char) -> Self {
if Self::is_confusable(c) {
Self::OnlyConfusables
} else {
Self::NotConfusable
}
}

fn update(&mut self, c: char) {
let is_confusable = Self::is_confusable(c);
*self = match (*self, is_confusable) {
(Self::OnlyConfusables, true) => Self::OnlyConfusables,
_ => Self::NotConfusable,
};
}

fn is_confusable(c: char) -> bool {
match c.script() {
Script::Common | Script::Unknown | Script::Latin => {
// `Common` and `Unkown` are not categories we're interested in.
// `Latin` is the primary locale, we don't consider it to be confusable.
false
},
_ => {
// Otherwise, actually check the symbol.
is_potential_mixed_script_confusable_char(c)
},
}
}
}

fn warning_required(locales: &FxHashMap<Script, ConfusablesState>) -> bool {
// If there is a single locale, we don't need a warning (e.g. `око` is a valid Russian
// word that consists of the consufables only).
if locales.len() <= 1 {
return false;
}

// Otherwise, check whether at least one of locales consists of confusables only.
locales
.iter()
.any(|(_loc, state)| matches!(state, ConfusablesState::OnlyConfusables))
}

impl<'tcx> LateLintPass<'tcx> for MixedLocaleIdents {
fn check_name(&mut self, cx: &LateContext<'tcx>, span: Span, ident: Symbol) {
let ident_name = ident.to_string();

// First pass just to check whether identifier is fully ASCII,
// without any expensive actions.
// Most of identifiers are *expected* to be ASCII to it's better
// to return early if possible.
if ident_name.is_ascii() {
return;
}

// Iterate through identifier with respect to its case
// and checks whether it contains mixed locales in a single identifier.
let ident_case = detect_case(&ident_name);
// Positions of the identifier part being checked.
// Used in report to render only relevant part.
let (mut ident_part_start, mut ident_part_end): (usize, usize) = (0, 0);
// List of locales used in the *identifier part*
let mut used_locales: FxHashMap<Script, ConfusablesState> = FxHashMap::default();
for (id, symbol) in ident_name.chars().enumerate() {
// Check whether we've reached the next identifier part.
let is_next_identifier = match ident_case {
Case::Camel => symbol.is_uppercase(),
Case::Snake => symbol == '_',
Case::Mixed => symbol.is_uppercase() || symbol == '_',
};

// If the new identifier started, we should not include its part
// into the report.
if !is_next_identifier {
ident_part_end = id;
}

if is_next_identifier {
if warning_required(&used_locales) {
// We've found the part that has multiple locales,
// no further analysis is required.
break;
}
// Clear everything and start checking the next identifier.
ident_part_start = id;
used_locales.clear();
}

let script = symbol.script();
if script != Script::Common && script != Script::Unknown {
used_locales
.entry(script)
.and_modify(|e| e.update(symbol))
.or_insert_with(|| ConfusablesState::from_char(symbol));
}
}

if warning_required(&used_locales) {
let ident_len = ident_part_end - ident_part_start + 1;
let ident_part: String = ident_name.chars().skip(ident_part_start).take(ident_len).collect();
let locales: Vec<&'static str> = used_locales
.iter()
.filter_map(|(loc, state)| match state {
ConfusablesState::OnlyConfusables => Some(loc.full_name()),
ConfusablesState::NotConfusable => None,
})
.collect();

assert!(
!locales.is_empty(),
"Warning is required; having no locales to report is a bug"
);

let message = format!(
"identifier part `{}` contains confusables-only symbols from the following locales: {}",
ident_part.trim_start_matches('_'),
locales.join(", "),
);

span_lint(cx, MIXED_LOCALE_IDENTS, span, &message);
}
}
}
2 changes: 1 addition & 1 deletion tests/ui/enum_variants.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#![warn(clippy::enum_variant_names)]
#![allow(non_camel_case_types, clippy::upper_case_acronyms)]
#![allow(non_camel_case_types, clippy::upper_case_acronyms, clippy::mixed_locale_idents)]

enum FakeCallType {
CALL,
Expand Down
81 changes: 81 additions & 0 deletions tests/ui/mixed_locale_idents.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#![warn(clippy::mixed_locale_idents)]
#![allow(dead_code, non_camel_case_types, confusable_idents, non_upper_case_globals)]

mod should_spawn_warnings {
// In the examples, cyrillic `о` is used in `Blоck`.

pub struct Blоck;
pub const BLОCK: u8 = 42;
pub fn blоck() {
let blоck = 42u8;
}

// Identifiers that consist of multiple parts.
pub struct SomeBlоckIdentifier;
pub const SOME_BLОCK_IDENT: u8 = 42;
pub fn some_blоck_fn() {
let some_blоck_var = 42u8;
}
pub struct Some_BlоckIdent; // Mixed case

// Identifiers that have multiple matches.
// Only the first match should be reported.
pub struct BlоckClоck;
pub const BLОCK_CLОCK: u8 = 42;
pub fn blоck_clоck() {
let blоck_clоck = 42u8;
}

// Identifiers that have both latin & non-latin word, and
// mixed case word.
pub struct SomeБлокBlоck;

// Identifier that has 3 locales, one of which is not confusable, and one is.
// It must not complain about Chinese (as it's not confusable), but report
// Cyrillic instead.
pub struct Blоck看;
}

mod should_not_spawn_warnings {
// In all the examples, `блок` is fully cyrillic.

pub struct TryБлок;
pub const TRY_БЛОК: u8 = 42;
pub fn try_блок() {
let try_блок_var = 42u8;
}
// Mixed case
pub struct Some_БлокIdent;

// Using non-confusables in ident together with confusables.
fn fnъуъ() {
let try看 = 42u8;
}

// Using only lating confusables (`o` is latin).
fn ooo_блок() {}

// One-word non-latin identifiers that contain non-confusables.
fn блок() {}
}

// Checks to see that some edge cases do not cause panics.
mod render_tests {
// One-letter Latin identifier. Should not trigger the warning.
struct O;

// One-letter Cyrillic identifier.
struct О;

// Multiple underscores (`O` is cyrillic; `o` is latin).
const __ZZZ___О__: u8 = 42;

const __ZZZ___Оo__: u8 = 42;
}

fn main() {
// Additional examples that should *not* spawn the warning.

// Should not spawn the lint, as it represents valid sequence in a single locale.
let nutzer_zähler = "user counter";
}
Loading