bpo-36216: Add check for characters in netloc that normalize to separators · python/cpython@16e6f7d
@@ -396,6 +396,21 @@ def _splitnetloc(url, start=0):
396396delim = min(delim, wdelim) # use earliest delim position
397397return url[start:delim], url[delim:] # return (domain, rest)
398398399+def _checknetloc(netloc):
400+if not netloc or netloc.isascii():
401+return
402+# looking for characters like \u2100 that expand to 'a/c'
403+# IDNA uses NFKC equivalence, so normalize for this check
404+import unicodedata
405+netloc2 = unicodedata.normalize('NFKC', netloc)
406+if netloc == netloc2:
407+return
408+_, _, netloc = netloc.rpartition('@') # anything to the left of '@' is okay
409+for c in '/?#@:':
410+if c in netloc2:
411+raise ValueError("netloc '" + netloc2 + "' contains invalid " +
412+"characters under NFKC normalization")
413+399414def urlsplit(url, scheme='', allow_fragments=True):
400415"""Parse a URL into 5 components:
401416 <scheme>://<netloc>/<path>?<query>#<fragment>
@@ -424,6 +439,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
424439url, fragment = url.split('#', 1)
425440if '?' in url:
426441url, query = url.split('?', 1)
442+_checknetloc(netloc)
427443v = SplitResult('http', netloc, url, query, fragment)
428444_parse_cache[key] = v
429445return _coerce_result(v)
@@ -447,6 +463,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
447463url, fragment = url.split('#', 1)
448464if '?' in url:
449465url, query = url.split('?', 1)
466+_checknetloc(netloc)
450467v = SplitResult(scheme, netloc, url, query, fragment)
451468_parse_cache[key] = v
452469return _coerce_result(v)