bpo-36216: Add check for characters in netloc that normalize to separators · python/cpython@16e6f7d

@@ -396,6 +396,21 @@ def _splitnetloc(url, start=0):

396396

delim = min(delim, wdelim) # use earliest delim position

397397

return url[start:delim], url[delim:] # return (domain, rest)

398398399+

def _checknetloc(netloc):

400+

if not netloc or netloc.isascii():

401+

return

402+

# looking for characters like \u2100 that expand to 'a/c'

403+

# IDNA uses NFKC equivalence, so normalize for this check

404+

import unicodedata

405+

netloc2 = unicodedata.normalize('NFKC', netloc)

406+

if netloc == netloc2:

407+

return

408+

_, _, netloc = netloc.rpartition('@') # anything to the left of '@' is okay

409+

for c in '/?#@:':

410+

if c in netloc2:

411+

raise ValueError("netloc '" + netloc2 + "' contains invalid " +

412+

"characters under NFKC normalization")

413+399414

def urlsplit(url, scheme='', allow_fragments=True):

400415

"""Parse a URL into 5 components:

401416

<scheme>://<netloc>/<path>?<query>#<fragment>

@@ -424,6 +439,7 @@ def urlsplit(url, scheme='', allow_fragments=True):

424439

url, fragment = url.split('#', 1)

425440

if '?' in url:

426441

url, query = url.split('?', 1)

442+

_checknetloc(netloc)

427443

v = SplitResult('http', netloc, url, query, fragment)

428444

_parse_cache[key] = v

429445

return _coerce_result(v)

@@ -447,6 +463,7 @@ def urlsplit(url, scheme='', allow_fragments=True):

447463

url, fragment = url.split('#', 1)

448464

if '?' in url:

449465

url, query = url.split('?', 1)

466+

_checknetloc(netloc)

450467

v = SplitResult(scheme, netloc, url, query, fragment)

451468

_parse_cache[key] = v

452469

return _coerce_result(v)