bpo-36216: Add check for characters in netloc that normalize to separ… · python/cpython@daad2c4
@@ -391,6 +391,21 @@ def _splitnetloc(url, start=0):
391391delim = min(delim, wdelim) # use earliest delim position
392392return url[start:delim], url[delim:] # return (domain, rest)
393393394+def _checknetloc(netloc):
395+if not netloc or netloc.isascii():
396+return
397+# looking for characters like \u2100 that expand to 'a/c'
398+# IDNA uses NFKC equivalence, so normalize for this check
399+import unicodedata
400+netloc2 = unicodedata.normalize('NFKC', netloc)
401+if netloc == netloc2:
402+return
403+_, _, netloc = netloc.rpartition('@') # anything to the left of '@' is okay
404+for c in '/?#@:':
405+if c in netloc2:
406+raise ValueError("netloc '" + netloc2 + "' contains invalid " +
407+"characters under NFKC normalization")
408+394409def urlsplit(url, scheme='', allow_fragments=True):
395410"""Parse a URL into 5 components:
396411 <scheme>://<netloc>/<path>?<query>#<fragment>
@@ -419,6 +434,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
419434url, fragment = url.split('#', 1)
420435if '?' in url:
421436url, query = url.split('?', 1)
437+_checknetloc(netloc)
422438v = SplitResult('http', netloc, url, query, fragment)
423439_parse_cache[key] = v
424440return _coerce_result(v)
@@ -442,6 +458,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
442458url, fragment = url.split('#', 1)
443459if '?' in url:
444460url, query = url.split('?', 1)
461+_checknetloc(netloc)
445462v = SplitResult(scheme, netloc, url, query, fragment)
446463_parse_cache[key] = v
447464return _coerce_result(v)