bpo-29456: Fix bugs in unicodedata.normalize: u1176, u11a7 and u11c3 … · python/cpython@0e2b76e

4 files changed

lines changed

Original file line number | Diff line number | Diff line change

@@ -208,6 +208,19 @@ def test_issue10254(self):

208208

b = 'C\u0338' * 20 + '\xC7'

209209

self.assertEqual(self.db.normalize('NFC', a), b)

210210
211+

def test_issue29456(self):

212+

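# bpo-29456: NFC must compose only modern Hangul jamo. U+1176 (just past
# the modern vowels) and U+11A7 / U+11C3 (just outside the modern trailing
# consonants) must be left uncomposed.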

213+

u1176_str_a = '\u1100\u1176\u11a8'

214+

u1176_str_b = '\u1100\u1176\u11a8'

215+

u11a7_str_a = '\u1100\u1175\u11a7'

216+

u11a7_str_b = '\uae30\u11a7'

217+

u11c3_str_a = '\u1100\u1175\u11c3'

218+

u11c3_str_b = '\uae30\u11c3'

219+

self.assertEqual(self.db.normalize('NFC', u1176_str_a), u1176_str_b)

220+

self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)

221+

self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)

222+
223+
211224

def test_east_asian_width(self):

212225

eaw = self.db.east_asian_width

213226

self.assertRaises(TypeError, eaw, b'a')

Original file line number | Diff line number | Diff line change

@@ -1791,6 +1791,7 @@ Jason Yeo

17911791

EungJun Yi

17921792

Bob Yodlowski

17931793

Danny Yoo

1794+

Wonsup Yoon

17941795

Rory Yorke

17951796

George Yoshida

17961797

Kazuhiro Yoshida

Original file line number | Diff line number | Diff line change

@@ -0,0 +1 @@

1+

Fix bugs in Hangul normalization: unicodedata.normalize() no longer composes U+1176, U+11A7 and U+11C3 into precomposed Hangul syllables.

Original file line number | Diff line number | Diff line change

@@ -681,15 +681,19 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)

681681

if (LBase <= code && code < (LBase+LCount) &&

682682

i + 1 < len &&

683683

VBase <= PyUnicode_READ(kind, data, i+1) &&

684-

PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {

684+

PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {

685+

/* Check that the L character is a modern leading consonant (0x1100 ~ 0x1112)

686+

and that the V character is a modern vowel (0x1161 ~ 0x1175). */

685687

int LIndex, VIndex;

686688

LIndex = code - LBase;

687689

VIndex = PyUnicode_READ(kind, data, i+1) - VBase;

688690

code = SBase + (LIndex*VCount+VIndex)*TCount;

689691

i+=2;

690692

if (i < len &&

691-

TBase <= PyUnicode_READ(kind, data, i) &&

692-

PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {

693+

TBase < PyUnicode_READ(kind, data, i) &&

694+

PyUnicode_READ(kind, data, i) < (TBase+TCount)) {

695+

/* Check that the T character is a modern trailing consonant

696+

(0x11A8 ~ 0x11C2); any other character is left uncomposed. */

693697

code += PyUnicode_READ(kind, data, i)-TBase;

694698

i++;

695699

}