path: root/main/python2/CVE-2019-9636.patch
From e37ef41289b77e0f0bb9a6aedb0360664c55bdd5 Mon Sep 17 00:00:00 2001
From: Steve Dower <steve.dower@microsoft.com>
Date: Thu, 7 Mar 2019 09:08:45 -0800
Subject: [PATCH] bpo-36216: Add check for characters in netloc that normalize
 to separators (GH-12201)

---
 Doc/library/urlparse.rst                      | 20 ++++++++++++++++
 Lib/test/test_urlparse.py                     | 24 +++++++++++++++++++
 Lib/urlparse.py                               | 17 +++++++++++++
 .../2019-03-06-09-38-40.bpo-36216.6q1m4a.rst  |  3 +++
 4 files changed, 64 insertions(+)
 create mode 100644 Misc/NEWS.d/next/Security/2019-03-06-09-38-40.bpo-36216.6q1m4a.rst

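For context (not part of the patch): a minimal Python 2.7 sketch of the behaviour change. The URL mirrors the one built in the new test below; u'\u2100' (ACCOUNT OF) NFKC-normalizes to u'a/c', so without this check it can smuggle a path separator into the parsed netloc.

import unicodedata
import urlparse

# u'\u2100' decomposes to u'a/c' under NFKC (the equivalence IDNA uses).
assert unicodedata.normalize('NFKC', u'\u2100') == u'a/c'

url = u'https://netloc\u2100false.netloc/path'
try:
    urlparse.urlsplit(url)
except ValueError as exc:
    # With this patch applied, the decomposing character is rejected.
    print 'rejected: %s' % exc
else:
    # Without the patch, the crafted netloc parses without complaint.
    print urlparse.urlsplit(url).netloc
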
diff --git a/Doc/library/urlparse.rst b/Doc/library/urlparse.rst
index 22249da54fbb..0989c88c3022 100644
--- a/Doc/library/urlparse.rst
+++ b/Doc/library/urlparse.rst
@@ -119,12 +119,22 @@ The :mod:`urlparse` module defines the following functions:
    See section :ref:`urlparse-result-object` for more information on the result
    object.
 
+   Characters in the :attr:`netloc` attribute that decompose under NFKC
+   normalization (as used by the IDNA encoding) into any of ``/``, ``?``,
+   ``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is
+   decomposed before parsing, or is not a Unicode string, no error will be
+   raised.
+
    .. versionchanged:: 2.5
       Added attributes to return value.
 
    .. versionchanged:: 2.7
       Added IPv6 URL parsing capabilities.
 
+   .. versionchanged:: 2.7.17
+      Characters that affect netloc parsing under NFKC normalization will
+      now raise :exc:`ValueError`.
+
 
 .. function:: parse_qs(qs[, keep_blank_values[, strict_parsing[, max_num_fields]]])
 
@@ -232,11 +242,21 @@ The :mod:`urlparse` module defines the following functions:
    See section :ref:`urlparse-result-object` for more information on the result
    object.
 
+   Characters in the :attr:`netloc` attribute that decompose under NFKC
+   normalization (as used by the IDNA encoding) into any of ``/``, ``?``,
+   ``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is
+   decomposed before parsing, or is not a Unicode string, no error will be
+   raised.
+
    .. versionadded:: 2.2
 
    .. versionchanged:: 2.5
       Added attributes to return value.
 
+   .. versionchanged:: 2.7.17
+      Characters that affect netloc parsing under NFKC normalization will
+      now raise :exc:`ValueError`.
+
 
 .. function:: urlunsplit(parts)
 
diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py
index 4e1ded73c266..73b0228ea8e3 100644
--- a/Lib/test/test_urlparse.py
+++ b/Lib/test/test_urlparse.py
@@ -1,4 +1,6 @@
 from test import test_support
+import sys
+import unicodedata
 import unittest
 import urlparse
 
@@ -624,6 +626,28 @@ def test_portseparator(self):
         self.assertEqual(urlparse.urlparse("http://www.python.org:80"),
                 ('http','www.python.org:80','','','',''))
 
+    def test_urlsplit_normalization(self):
+        # Certain characters should never occur in the netloc,
+        # including under normalization.
+        # Ensure that ALL of them are detected and cause an error
+        illegal_chars = u'/:#?@'
+        hex_chars = {'{:04X}'.format(ord(c)) for c in illegal_chars}
+        denorm_chars = [
+            c for c in map(unichr, range(128, sys.maxunicode))
+            if (hex_chars & set(unicodedata.decomposition(c).split()))
+            and c not in illegal_chars
+        ]
+        # Sanity check that we found at least one such character
+        self.assertIn(u'\u2100', denorm_chars)
+        self.assertIn(u'\uFF03', denorm_chars)
+
+        for scheme in [u"http", u"https", u"ftp"]:
+            for c in denorm_chars:
+                url = u"{}://netloc{}false.netloc/path".format(scheme, c)
+                print "Checking %r" % url
+                with self.assertRaises(ValueError):
+                    urlparse.urlsplit(url)
+
 def test_main():
     test_support.run_unittest(UrlParseTestCase)
 
diff --git a/Lib/urlparse.py b/Lib/urlparse.py
index f7c2b032b097..54eda08651ab 100644
--- a/Lib/urlparse.py
+++ b/Lib/urlparse.py
@@ -165,6 +165,21 @@ def _splitnetloc(url, start=0):
             delim = min(delim, wdelim)     # use earliest delim position
     return url[start:delim], url[delim:]   # return (domain, rest)
 
+def _checknetloc(netloc):
+    if not netloc or not isinstance(netloc, unicode):
+        return
+    # looking for characters like \u2100 that expand to 'a/c'
+    # IDNA uses NFKC equivalence, so normalize for this check
+    import unicodedata
+    netloc2 = unicodedata.normalize('NFKC', netloc)
+    if netloc == netloc2:
+        return
+    _, _, netloc = netloc.rpartition('@') # anything to the left of '@' is okay
+    for c in '/?#@:':
+        if c in netloc2:
+            raise ValueError("netloc '" + netloc2 + "' contains invalid " +
+                             "characters under NFKC normalization")
+
 def urlsplit(url, scheme='', allow_fragments=True):
     """Parse a URL into 5 components:
     <scheme>://<netloc>/<path>?<query>#<fragment>
@@ -193,6 +208,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
                 url, fragment = url.split('#', 1)
             if '?' in url:
                 url, query = url.split('?', 1)
+            _checknetloc(netloc)
             v = SplitResult(scheme, netloc, url, query, fragment)
             _parse_cache[key] = v
             return v
@@ -216,6 +232,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
         url, fragment = url.split('#', 1)
     if '?' in url:
         url, query = url.split('?', 1)
+    _checknetloc(netloc)
     v = SplitResult(scheme, netloc, url, query, fragment)
     _parse_cache[key] = v
     return v
diff --git a/Misc/NEWS.d/next/Security/2019-03-06-09-38-40.bpo-36216.6q1m4a.rst b/Misc/NEWS.d/next/Security/2019-03-06-09-38-40.bpo-36216.6q1m4a.rst
new file mode 100644
index 000000000000..1e1ad92c6feb
--- /dev/null
+++ b/Misc/NEWS.d/next/Security/2019-03-06-09-38-40.bpo-36216.6q1m4a.rst
@@ -0,0 +1,3 @@
+Changes urlsplit() to raise ValueError when the URL contains characters that
+decompose under IDNA encoding (NFKC-normalization) into characters that
+affect how the URL is parsed.
\ No newline at end of file
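
Note (not part of the patch): the documentation hunks above call out two cases the new check deliberately skips. A small sketch of both, assuming the patch is applied, under Python 2.7:

import urlparse

# Byte strings are never checked: _checknetloc returns early when the
# netloc is not a unicode object.
print urlparse.urlsplit('http://netloc/path').netloc

# A unicode URL whose netloc is already NFKC-normalized is also left
# alone, since normalization does not change it.
print urlparse.urlsplit(u'http://netloc/path').netloc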