Skip to content

fix: check for duplicates in keyword validate #14585

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from

Conversation

polvalente
Copy link
Contributor

@polvalente polvalente commented Jun 17, 2025

closes #14584

I've run benchmarks with Benchee, results below. Unfortunately, it seems that as the lists grow bigger, the execution time difference becomes very significative. The test cases are very artificial, but the impact is there.

in the benchmark, Keyword.validate is the version included in this PR,
while the other 2 versions are in the module below:

defmodule BenchmarkKeyword do
  defmodule FrequencyMap do
    def validate(keyword, values) when is_list(keyword) and is_list(values) do
      case dedup_values(values) do
        :ok ->
          validate(keyword, values, [], [], [])

        {:error, bad_keys} ->
          {:error, bad_keys}
      end
    end

    defp dedup_values(values, acc \\ %{})

    defp dedup_values([], _acc) do
      :ok
    end

    defp dedup_values([{key, _} | rest], acc) when is_atom(key) and is_map_key(acc, key) do
      {:error, [key]}
    end

    defp dedup_values([key | rest], acc) when is_atom(key) and is_map_key(acc, key) do
      {:error, [key]}
    end

    defp dedup_values([h | rest], acc) do
      dedup_values(rest, Map.put(acc, h, 1))
    end

    defp validate([{key, _} = pair | keyword], values1, values2, acc, bad_keys)
         when is_atom(key) do
      case find_key!(key, values1, values2) do
        {values1, values2} ->
          validate(keyword, values1, values2, [pair | acc], bad_keys)

        :error ->
          case find_key!(key, values2, values1) do
            {values1, values2} ->
              validate(keyword, values1, values2, [pair | acc], bad_keys)

            :error ->
              validate(keyword, values1, values2, acc, [key | bad_keys])
          end
      end
    end

    defp validate([], values1, values2, acc, []) do
      with {:ok, acc} <- move_pairs!(acc, {[], %{}}),
           {:ok, acc} <- move_pairs!(values2, acc),
           {:ok, {list, _freq_map}} <- move_pairs!(values1, acc) do
        {:ok, list}
      end
    end

    defp validate([], _values1, _values2, _acc, bad_keys) do
      {:error, bad_keys}
    end

    defp validate([pair | _], _values1, _values2, _acc, []) do
      raise ArgumentError,
            "expected a keyword list as first argument, got invalid entry: #{inspect(pair)}"
    end

    defp find_key!(key, [key | rest], acc), do: {rest, acc}
    defp find_key!(key, [{key, _} | rest], acc), do: {rest, acc}
    defp find_key!(key, [head | tail], acc), do: find_key!(key, tail, [head | acc])
    defp find_key!(_key, [], _acc), do: :error

    defp move_pairs!([key | rest], acc) when is_atom(key),
      do: move_pairs!(rest, acc)

    defp move_pairs!([{key, _} = pair | rest], acc) when is_atom(key),
      do: move_pairs!(rest, [pair | acc])

    defp move_pairs!([], acc), do: {:ok, acc}

    defp move_pairs!([other | _], _) do
      raise ArgumentError,
            "expected the second argument to be a list of atoms or tuples, got: #{inspect(other)}"
    end

    # defp move_pairs!([key | rest], acc) when is_atom(key),
    #   do: move_pairs!(rest, acc)

    # defp move_pairs!([{key, _} | _], {_acc, frequency_map})
    #      when is_atom(key) and is_map_key(frequency_map, key) do
    #   {:error, [key]}
    # end

    # defp move_pairs!([{key, _} = pair | rest], {acc, frequency_map}) when is_atom(key),
    #   do: move_pairs!(rest, {[pair | acc], Map.put(frequency_map, key, 1)})

    # defp move_pairs!([], acc), do: {:ok, acc}

    # defp move_pairs!([other | _], _) do
    #   raise ArgumentError,
    #         "expected the second argument to be a list of atoms or tuples, got: #{inspect(other)}"
    # end
  end

  def run do
    Benchee.run(
      %{
        "validate with dedup (freq map)" => &apply(__MODULE__.FrequencyMap, :validate, &1),
        "validate without dedup" => &apply(__MODULE__, :old_validate, &1),
        "validate with dedup" => &apply(Keyword, :validate, &1)
      },
      inputs: %{
        "10 duplicate start" => build_lists(10, :start),
        # "10 duplicate end" => build_lists(10, :end),
        "100 duplicate start" => build_lists(100, :start),
        # "100 duplicate end" => build_lists(100, :end),
        "10000 duplicate start" => build_lists(10000, :start)
        # "10000 duplicate end" => build_lists(10000, :end)
      }
    )

    :ok
  end

  defp build_lists(len, duplicate) do
    list =
      for idx <- 1..len do
        key = idx |> to_string() |> String.pad_leading(5, "0") |> String.to_atom()
        {key, idx}
      end

    l2 =
      case duplicate do
        :end ->
          list ++ [List.last(list)]

        :start ->
          [hd(list)] ++ list
      end

    [list, l2]
  end

  def old_validate(keyword, values) when is_list(keyword) and is_list(values) do
    validate(keyword, values, [], [], [])
  end

  defp validate([{key, _} = pair | keyword], values1, values2, acc, bad_keys) when is_atom(key) do
    case find_key!(key, values1, values2) do
      {values1, values2} ->
        validate(keyword, values1, values2, [pair | acc], bad_keys)

      :error ->
        case find_key!(key, values2, values1) do
          {values1, values2} ->
            validate(keyword, values1, values2, [pair | acc], bad_keys)

          :error ->
            validate(keyword, values1, values2, acc, [key | bad_keys])
        end
    end
  end

  defp validate([], values1, values2, acc, []) do
    list = move_pairs!(values1, move_pairs!(values2, acc))
    {:ok, list}
  end

  defp validate([], _values1, _values2, _acc, bad_keys) do
    {:error, bad_keys}
  end

  defp validate([pair | _], _values1, _values2, _acc, []) do
    raise ArgumentError,
          "expected a keyword list as first argument, got invalid entry: #{inspect(pair)}"
  end

  defp find_key!(key, [key | rest], acc), do: {rest, acc}
  defp find_key!(key, [{key, _} | rest], acc), do: {rest, acc}
  defp find_key!(key, [head | tail], acc), do: find_key!(key, tail, [head | acc])
  defp find_key!(_key, [], _acc), do: :error

  defp move_pairs!([key | rest], acc) when is_atom(key),
    do: move_pairs!(rest, acc)

  defp move_pairs!([{key, _} = pair | rest], acc) when is_atom(key),
    do: move_pairs!(rest, [pair | acc])

  defp move_pairs!([], acc),
    do: acc

  defp move_pairs!([other | _], _) do
    raise ArgumentError,
          "expected the second argument to be a list of atoms or tuples, got: #{inspect(other)}"
  end
end
 BenchmarkKeyword.run
Operating System: macOS
CPU Information: Apple M1 Pro
Number of Available Cores: 10
Available memory: 32 GB
Elixir 1.20.0-dev
Erlang 27.2.1
JIT enabled: true

Benchmark suite executing with the following configuration:
warmup: 2 s
time: 5 s
memory time: 0 ns
reduction time: 0 ns
parallel: 1
inputs: 10 duplicate start, 100 duplicate start, 10000 duplicate start
Estimated total run time: 1 min 3 s

Benchmarking validate with dedup with input 10 duplicate start ...
Benchmarking validate with dedup with input 100 duplicate start ...
Benchmarking validate with dedup with input 10000 duplicate start ...
Benchmarking validate with dedup (freq map) with input 10 duplicate start ...
Benchmarking validate with dedup (freq map) with input 100 duplicate start ...
Benchmarking validate with dedup (freq map) with input 10000 duplicate start ...
Benchmarking validate without dedup with input 10 duplicate start ...
Benchmarking validate without dedup with input 100 duplicate start ...
Benchmarking validate without dedup with input 10000 duplicate start ...
Calculating statistics...
Formatting results...

##### With input 10 duplicate start #####
Name                                     ips        average  deviation         median         99th %
validate without dedup                6.44 M      155.26 ns ±23763.10%          83 ns         208 ns
validate with dedup                   1.53 M      654.71 ns  ±2965.51%         583 ns         750 ns
validate with dedup (freq map)        0.89 M     1127.21 ns  ±1322.90%        1042 ns        1250 ns

Comparison: 
validate without dedup                6.44 M
validate with dedup                   1.53 M - 4.22x slower +499.45 ns
validate with dedup (freq map)        0.89 M - 7.26x slower +971.95 ns

##### With input 100 duplicate start #####
Name                                     ips        average  deviation         median         99th %
validate without dedup             1795.30 K        0.56 μs  ±4502.22%        0.46 μs        0.67 μs
validate with dedup                 167.53 K        5.97 μs   ±270.78%        5.67 μs       12.38 μs
validate with dedup (freq map)       69.11 K       14.47 μs    ±21.05%       13.88 μs       24.97 μs

Comparison: 
validate without dedup             1795.30 K
validate with dedup                 167.53 K - 10.72x slower +5.41 μs
validate with dedup (freq map)       69.11 K - 25.98x slower +13.91 μs

##### With input 10000 duplicate start #####
Name                                     ips        average  deviation         median         99th %
validate without dedup               15.27 K       65.51 μs    ±29.19%       67.67 μs       97.05 μs
validate with dedup                   1.02 K      984.61 μs    ±12.41%      947.92 μs     1469.24 μs
validate with dedup (freq map)        0.48 K     2089.97 μs     ±8.77%     2044.79 μs     2709.85 μs

Comparison: 
validate without dedup               15.27 K
validate with dedup                   1.02 K - 15.03x slower +919.10 μs
validate with dedup (freq map)        0.48 K - 31.90x slower +2024.46 μs
:ok

Comment on lines 283 to 293
defp validate([], values1, values2, acc, []) do
{:ok, move_pairs!(values1, move_pairs!(values2, acc))}
list = move_pairs!(values1, move_pairs!(values2, acc))

{has_duplicate, duplicate_key} = find_duplicate_keys(list)

if has_duplicate do
{:error, [duplicate_key]}
else
{:ok, list}
end
end
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@josevalim I've put a benchmark in the PR description which I think is encouraging.

If it makes sense to move forward with this solution I'll add doctests in validate/2 and validate!/2

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm re-running benchmarks because I realized I was testing 2 very close cases due to alphabetical ordering.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With the updated results it's clear that there is an impact in checking the duplicates.
I did get bitten in production by this, so perhaps we could introduce validate/3 in the worst case scenario where there's an opt-in flag for this check

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if there's a more optimal version possible by merging the functionallity of move_pairs! and find_duplicate_keys in 1.
It's going to probably be more complex, but maybe a bit more efficient?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought about adding a map of key => frequency to move pairs, initialized to the equivalent value from acc

Then at the end of move pairs, just traverse that and return the keys with count > 1

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@josevalim @tcoopman I updated the benchmark with a compiled module (also attached in the PR body).

I didn't have much luck with getting results as good as the ones without the validation. In absolute time it's not much of a difference, but it's a very significative ratio

@josevalim
Copy link
Member

The benchmark has evaluation warnings. Can you make sure to move all benchmarking code to a module and then call it instead? Thanks!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Development

Successfully merging this pull request may close these issues.

Keyword.validate/2 has unpredictable behavior when accepted keys have duplicate entries
3 participants