bpo-36216: Add check for characters in netloc that normalize to separators · python/cpython@daad2c4

@@ -391,6 +391,21 @@ def _splitnetloc(url, start=0):

391391

delim = min(delim, wdelim) # use earliest delim position

392392

return url[start:delim], url[delim:] # return (domain, rest)

393393394+

def _checknetloc(netloc):

395+

if not netloc or netloc.isascii():

396+

return

397+

# looking for characters like \u2100 that expand to 'a/c'

398+

# IDNA uses NFKC equivalence, so normalize for this check

399+

import unicodedata

400+

netloc2 = unicodedata.normalize('NFKC', netloc)

401+

if netloc == netloc2:

402+

return

403+

_, _, netloc = netloc.rpartition('@') # anything to the left of '@' is okay

404+

for c in '/?#@:':

405+

if c in netloc2:

406+

raise ValueError("netloc '" + netloc2 + "' contains invalid " +

407+

"characters under NFKC normalization")

408+394409

def urlsplit(url, scheme='', allow_fragments=True):

395410

"""Parse a URL into 5 components:

396411

<scheme>://<netloc>/<path>?<query>#<fragment>

@@ -419,6 +434,7 @@ def urlsplit(url, scheme='', allow_fragments=True):

419434

url, fragment = url.split('#', 1)

420435

if '?' in url:

421436

url, query = url.split('?', 1)

437+

_checknetloc(netloc)

422438

v = SplitResult('http', netloc, url, query, fragment)

423439

_parse_cache[key] = v

424440

return _coerce_result(v)

@@ -442,6 +458,7 @@ def urlsplit(url, scheme='', allow_fragments=True):

442458

url, fragment = url.split('#', 1)

443459

if '?' in url:

444460

url, query = url.split('?', 1)

461+

_checknetloc(netloc)

445462

v = SplitResult(scheme, netloc, url, query, fragment)

446463

_parse_cache[key] = v

447464

return _coerce_result(v)