bpo-31690: Allow the inline flags "a", "L", and "u" to be used as group flags. · python/cpython@3557b05
@@ -62,6 +62,12 @@
6262_ignorecase_fixes = {i: tuple(j for j in t if i != j)
6363for t in _equivalences for i in t}
def _combine_flags(flags, add_flags, del_flags,
                   TYPE_FLAGS=sre_parse.TYPE_FLAGS):
    """Return the flag set in effect after applying a group's flag changes.

    flags     -- integer bitmask of the flags currently in effect.
    add_flags -- bitmask of flags the group switches on.
    del_flags -- bitmask of flags the group switches off.
    TYPE_FLAGS -- bitmask of the "type" flags; defaults to
        sre_parse.TYPE_FLAGS, looked up once when the function is defined.
        (Presumably the flags behind the inline "a", "L" and "u" letters --
        confirm against sre_parse.)

    If add_flags turns on any bit of TYPE_FLAGS, every TYPE_FLAGS bit is
    first cleared from flags, so the newly requested type flag replaces the
    inherited one instead of being OR-ed on top of it.  After that,
    add_flags is merged in and del_flags is masked out.
    """
    if add_flags & TYPE_FLAGS:
        # A new type flag was requested: drop whichever one was inherited.
        flags &= ~TYPE_FLAGS
    return (flags | add_flags) & ~del_flags
70+6571def _compile(code, pattern, flags):
6672# internal: compile a (sub)pattern
6773emit = code.append
@@ -87,15 +93,21 @@ def _compile(code, pattern, flags):
8793emit(op)
8894emit(av)
8995elif flags & SRE_FLAG_LOCALE:
90-emit(OP_LOC_IGNORE[op])
96+emit(OP_LOCALE_IGNORE[op])
9197emit(av)
9298elif not iscased(av):
9399emit(op)
94100emit(av)
95101else:
96102lo = tolower(av)
97-if fixes and lo in fixes:
98-emit(IN_IGNORE)
103+if not fixes: # ascii
104+emit(OP_IGNORE[op])
105+emit(lo)
106+elif lo not in fixes:
107+emit(OP_UNICODE_IGNORE[op])
108+emit(lo)
109+else:
110+emit(IN_UNI_IGNORE)
99111skip = _len(code); emit(0)
100112if op is NOT_LITERAL:
101113emit(NEGATE)
@@ -104,17 +116,16 @@ def _compile(code, pattern, flags):
104116emit(k)
105117emit(FAILURE)
106118code[skip] = _len(code) - skip
107-else:
108-emit(OP_IGNORE[op])
109-emit(lo)
110119elif op is IN:
111120charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
112121if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
113122emit(IN_LOC_IGNORE)
114-elif hascased:
123+elif not hascased:
124+emit(IN)
125+elif not fixes: # ascii
115126emit(IN_IGNORE)
116127else:
117-emit(IN)
128+emit(IN_UNI_IGNORE)
118129skip = _len(code); emit(0)
119130_compile_charset(charset, flags, code)
120131code[skip] = _len(code) - skip
@@ -153,8 +164,8 @@ def _compile(code, pattern, flags):
153164if group:
154165emit(MARK)
155166emit((group-1)*2)
156-# _compile_info(code, p, (flags | add_flags) & ~del_flags)
157-_compile(code, p, (flags | add_flags) & ~del_flags)
167+# _compile_info(code, p, _combine_flags(flags, add_flags, del_flags))
168+_compile(code, p, _combine_flags(flags, add_flags, del_flags))
158169if group:
159170emit(MARK)
160171emit((group-1)*2+1)
@@ -210,10 +221,14 @@ def _compile(code, pattern, flags):
210221av = CH_UNICODE[av]
211222emit(av)
212223elif op is GROUPREF:
213-if flags & SRE_FLAG_IGNORECASE:
214-emit(OP_IGNORE[op])
215-else:
224+if not flags & SRE_FLAG_IGNORECASE:
216225emit(op)
226+elif flags & SRE_FLAG_LOCALE:
227+emit(GROUPREF_LOC_IGNORE)
228+elif not fixes: # ascii
229+emit(GROUPREF_IGNORE)
230+else:
231+emit(GROUPREF_UNI_IGNORE)
217232emit(av-1)
218233elif op is GROUPREF_EXISTS:
219234emit(op)
@@ -240,7 +255,7 @@ def _compile_charset(charset, flags, code):
240255pass
241256elif op is LITERAL:
242257emit(av)
243-elif op is RANGE or op is RANGE_IGNORE:
258+elif op is RANGE or op is RANGE_UNI_IGNORE:
244259emit(av[0])
245260emit(av[1])
246261elif op is CHARSET:
@@ -309,9 +324,9 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
309324hascased = True
310325# There are only two ranges of cased non-BMP characters:
311326# 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
312-# and for both ranges RANGE_IGNORE works.
327+# and for both ranges RANGE_UNI_IGNORE works.
313328if op is RANGE:
314-op = RANGE_IGNORE
329+op = RANGE_UNI_IGNORE
315330tail.append((op, av))
316331break
317332@@ -456,7 +471,7 @@ def _get_literal_prefix(pattern, flags):
456471prefixappend(av)
457472elif op is SUBPATTERN:
458473group, add_flags, del_flags, p = av
459-flags1 = (flags | add_flags) & ~del_flags
474+flags1 = _combine_flags(flags, add_flags, del_flags)
460475if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:
461476break
462477prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1)
@@ -482,7 +497,7 @@ def _get_charset_prefix(pattern, flags):
482497if op is not SUBPATTERN:
483498break
484499group, add_flags, del_flags, pattern = av
485-flags = (flags | add_flags) & ~del_flags
500+flags = _combine_flags(flags, add_flags, del_flags)
486501if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
487502return None
488503@@ -631,6 +646,7 @@ def print_2(*args):
631646print_(op)
632647elif op in (LITERAL, NOT_LITERAL,
633648LITERAL_IGNORE, NOT_LITERAL_IGNORE,
649+LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE,
634650LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE):
635651arg = code[i]
636652i += 1
@@ -647,12 +663,12 @@ def print_2(*args):
647663arg = str(CHCODES[arg])
648664assert arg[:9] == 'CATEGORY_'
649665print_(op, arg[9:])
650-elif op in (IN, IN_IGNORE, IN_LOC_IGNORE):
666+elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE):
651667skip = code[i]
652668print_(op, skip, to=i+skip)
653669dis_(i+1, i+skip)
654670i += skip
655-elif op in (RANGE, RANGE_IGNORE):
671+elif op in (RANGE, RANGE_UNI_IGNORE):
656672lo, hi = code[i: i+2]
657673i += 2
658674print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi)))
@@ -671,7 +687,8 @@ def print_2(*args):
671687print_2(_hex_code(code[i: i + 256//_CODEBITS]))
672688i += 256//_CODEBITS
673689level -= 1
674-elif op in (MARK, GROUPREF, GROUPREF_IGNORE):
690+elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE,
691+GROUPREF_LOC_IGNORE):
675692arg = code[i]
676693i += 1
677694print_(op, arg)