Skip to content

bpo-47080: Use atomic groups to simplify fnmatch #32029

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 21, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 5 additions & 19 deletions Lib/fnmatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,6 @@

__all__ = ["filter", "fnmatch", "fnmatchcase", "translate"]

# Build a thread-safe incrementing counter to help create unique regexp group
# names across calls.
from itertools import count
_nextgroupnum = count().__next__
del count

def fnmatch(name, pat):
"""Test whether FILENAME matches PATTERN.

Expand Down Expand Up @@ -149,17 +143,10 @@ def translate(pat):
# Now deal with STAR fixed STAR fixed ...
# For an interior `STAR fixed` pairing, we want to do a minimal
# .*? match followed by `fixed`, with no possibility of backtracking.
# We can't spell that directly, but can trick it into working by matching
# .*?fixed
# in a lookahead assertion, save the matched part in a group, then
# consume that group via a backreference. If the overall match fails,
# the lookahead assertion won't try alternatives. So the translation is:
# (?=(?P<name>.*?fixed))(?P=name)
# Group names are created as needed: g0, g1, g2, ...
# The numbers are obtained from _nextgroupnum() to ensure they're unique
# across calls and across threads. This is because people rely on the
# undocumented ability to join multiple translate() results together via
# "|" to build large regexps matching "one of many" shell patterns.
# Atomic groups ("(?>...)") allow us to spell that directly.
# Note: people rely on the undocumented ability to join multiple
# translate() results together via "|" to build large regexps matching
# "one of many" shell patterns.
while i < n:
assert inp[i] is STAR
i += 1
Expand All @@ -176,8 +163,7 @@ def translate(pat):
add(".*")
add(fixed)
else:
groupnum = _nextgroupnum()
add(f"(?=(?P<g{groupnum}>.*?{fixed}))(?P=g{groupnum})")
add(f"(?>.*?{fixed})")
assert i == n
res = "".join(res)
return fr'(?s:{res})\Z'
12 changes: 2 additions & 10 deletions Lib/test/test_fnmatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,17 +124,9 @@ def test_translate(self):
self.assertEqual(translate('A*********?[?]?'), r'(?s:A.*.[?].)\Z')
# fancy translation to prevent exponential-time match failure
t = translate('**a*a****a')
digits = re.findall(r'\d+', t)
self.assertEqual(len(digits), 4)
self.assertEqual(digits[0], digits[1])
self.assertEqual(digits[2], digits[3])
g1 = f"g{digits[0]}" # e.g., group name "g4"
g2 = f"g{digits[2]}" # e.g., group name "g5"
self.assertEqual(t,
fr'(?s:(?=(?P<{g1}>.*?a))(?P={g1})(?=(?P<{g2}>.*?a))(?P={g2}).*a)\Z')
self.assertEqual(t, r'(?s:(?>.*?a)(?>.*?a).*a)\Z')
# and try pasting multiple translate results - it's an undocumented
# feature that this works; all the pain of generating unique group
# names across calls exists to support this
# feature that this works
r1 = translate('**a**a**a*')
r2 = translate('**b**b**b*')
r3 = translate('*c*c*c*')
Expand Down