bpo-31690: Allow the inline flags "a", "L", and "u" to be used as gro… · python/cpython@3557b05

@@ -62,6 +62,12 @@

6262

# Maps every member i of each group in _equivalences to a tuple of the
# *other* members of that same group (i itself is excluded).
_ignorecase_fixes = {i: tuple(j for j in t if i != j)
                     for t in _equivalences for i in t}

646465+

def _combine_flags(flags, add_flags, del_flags,

66+

TYPE_FLAGS=sre_parse.TYPE_FLAGS):

67+

if add_flags & TYPE_FLAGS:

68+

flags &= ~TYPE_FLAGS

69+

return (flags | add_flags) & ~del_flags

70+6571

def _compile(code, pattern, flags):

6672

# internal: compile a (sub)pattern

6773

emit = code.append

@@ -87,15 +93,21 @@ def _compile(code, pattern, flags):

8793

emit(op)

8894

emit(av)

8995

elif flags & SRE_FLAG_LOCALE:

90-

emit(OP_LOC_IGNORE[op])

96+

emit(OP_LOCALE_IGNORE[op])

9197

emit(av)

9298

elif not iscased(av):

9399

emit(op)

94100

emit(av)

95101

else:

96102

lo = tolower(av)

97-

if fixes and lo in fixes:

98-

emit(IN_IGNORE)

103+

if not fixes: # ascii

104+

emit(OP_IGNORE[op])

105+

emit(lo)

106+

elif lo not in fixes:

107+

emit(OP_UNICODE_IGNORE[op])

108+

emit(lo)

109+

else:

110+

emit(IN_UNI_IGNORE)

99111

skip = _len(code); emit(0)

100112

if op is NOT_LITERAL:

101113

emit(NEGATE)

@@ -104,17 +116,16 @@ def _compile(code, pattern, flags):

104116

emit(k)

105117

emit(FAILURE)

106118

code[skip] = _len(code) - skip

107-

else:

108-

emit(OP_IGNORE[op])

109-

emit(lo)

110119

elif op is IN:

111120

charset, hascased = _optimize_charset(av, iscased, tolower, fixes)

112121

if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:

113122

emit(IN_LOC_IGNORE)

114-

elif hascased:

123+

elif not hascased:

124+

emit(IN)

125+

elif not fixes: # ascii

115126

emit(IN_IGNORE)

116127

else:

117-

emit(IN)

128+

emit(IN_UNI_IGNORE)

118129

skip = _len(code); emit(0)

119130

_compile_charset(charset, flags, code)

120131

code[skip] = _len(code) - skip

@@ -153,8 +164,8 @@ def _compile(code, pattern, flags):

153164

if group:

154165

emit(MARK)

155166

emit((group-1)*2)

156-

# _compile_info(code, p, (flags | add_flags) & ~del_flags)

157-

_compile(code, p, (flags | add_flags) & ~del_flags)

167+

# _compile_info(code, p, _combine_flags(flags, add_flags, del_flags))

168+

_compile(code, p, _combine_flags(flags, add_flags, del_flags))

158169

if group:

159170

emit(MARK)

160171

emit((group-1)*2+1)

@@ -210,10 +221,14 @@ def _compile(code, pattern, flags):

210221

av = CH_UNICODE[av]

211222

emit(av)

212223

elif op is GROUPREF:

213-

if flags & SRE_FLAG_IGNORECASE:

214-

emit(OP_IGNORE[op])

215-

else:

224+

if not flags & SRE_FLAG_IGNORECASE:

216225

emit(op)

226+

elif flags & SRE_FLAG_LOCALE:

227+

emit(GROUPREF_LOC_IGNORE)

228+

elif not fixes: # ascii

229+

emit(GROUPREF_IGNORE)

230+

else:

231+

emit(GROUPREF_UNI_IGNORE)

217232

emit(av-1)

218233

elif op is GROUPREF_EXISTS:

219234

emit(op)

@@ -240,7 +255,7 @@ def _compile_charset(charset, flags, code):

240255

pass

241256

elif op is LITERAL:

242257

emit(av)

243-

elif op is RANGE or op is RANGE_IGNORE:

258+

elif op is RANGE or op is RANGE_UNI_IGNORE:

244259

emit(av[0])

245260

emit(av[1])

246261

elif op is CHARSET:

@@ -309,9 +324,9 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):

309324

hascased = True

310325

# There are only two ranges of cased non-BMP characters:

311326

# 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),

312-

# and for both ranges RANGE_IGNORE works.

327+

# and for both ranges RANGE_UNI_IGNORE works.

313328

if op is RANGE:

314-

op = RANGE_IGNORE

329+

op = RANGE_UNI_IGNORE

315330

tail.append((op, av))

316331

break

317332

@@ -456,7 +471,7 @@ def _get_literal_prefix(pattern, flags):

456471

prefixappend(av)

457472

elif op is SUBPATTERN:

458473

group, add_flags, del_flags, p = av

459-

flags1 = (flags | add_flags) & ~del_flags

474+

flags1 = _combine_flags(flags, add_flags, del_flags)

460475

if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:

461476

break

462477

prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1)

@@ -482,7 +497,7 @@ def _get_charset_prefix(pattern, flags):

482497

if op is not SUBPATTERN:

483498

break

484499

group, add_flags, del_flags, pattern = av

485-

flags = (flags | add_flags) & ~del_flags

500+

flags = _combine_flags(flags, add_flags, del_flags)

486501

if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:

487502

return None

488503

@@ -631,6 +646,7 @@ def print_2(*args):

631646

print_(op)

632647

elif op in (LITERAL, NOT_LITERAL,

633648

LITERAL_IGNORE, NOT_LITERAL_IGNORE,

649+

LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE,

634650

LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE):

635651

arg = code[i]

636652

i += 1

@@ -647,12 +663,12 @@ def print_2(*args):

647663

arg = str(CHCODES[arg])

648664

assert arg[:9] == 'CATEGORY_'

649665

print_(op, arg[9:])

650-

elif op in (IN, IN_IGNORE, IN_LOC_IGNORE):

666+

elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE):

651667

skip = code[i]

652668

print_(op, skip, to=i+skip)

653669

dis_(i+1, i+skip)

654670

i += skip

655-

elif op in (RANGE, RANGE_IGNORE):

671+

elif op in (RANGE, RANGE_UNI_IGNORE):

656672

lo, hi = code[i: i+2]

657673

i += 2

658674

print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi)))

@@ -671,7 +687,8 @@ def print_2(*args):

671687

print_2(_hex_code(code[i: i + 256//_CODEBITS]))

672688

i += 256//_CODEBITS

673689

level -= 1

674-

elif op in (MARK, GROUPREF, GROUPREF_IGNORE):

690+

elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE,

691+

GROUPREF_LOC_IGNORE):

675692

arg = code[i]

676693

i += 1

677694

print_(op, arg)