bpo-41158: IDLE: rewrite the code for handling file encoding (GH-21215) · python/cpython@fe0175f

1-

import codecs

2-

from codecs import BOM_UTF8

31

import os

4-

import re

52

import shlex

63

import sys

74

import tempfile

5+

import tokenize

8697

import tkinter.filedialog as tkFileDialog

108

import tkinter.messagebox as tkMessageBox

@@ -20,49 +18,6 @@

2018

errors = 'surrogateescape'

2119222023-

# Matches a PEP 263 declaration such as "# -*- coding: latin-1 -*-";
# group 1 is the encoding name exactly as written.
coding_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
# Matches a line that is empty or holds nothing but a comment.
blank_re = re.compile(r'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)


def coding_spec(data):
    """Return the encoding declaration according to PEP 263.

    data is the source text, as str or bytes.  Only its first two
    physical lines are examined, and the second one only when the first
    is blank or a comment.  When checking encoded data, pass in just the
    first two lines; they are the only place a declaration may appear.

    Return the declared encoding name as written in the source, or None
    if there is no declaration.

    Raise a LookupError if the encoding is declared but unknown.
    """
    if isinstance(data, bytes):
        # This encoding might be wrong.  However, the coding spec must
        # be ASCII-only, so any non-ASCII characters around here will be
        # ignored.  Decoding to Latin-1 maps every byte to a character,
        # so it cannot raise UnicodeDecodeError.
        text = data.decode('iso-8859-1')
    else:
        text = data
    # Split on every newline convention at once.  Splitting on '\n'
    # alone would glue '\r'-terminated lines together, and since '.'
    # matches '\r', coding_re would then accept a declaration found
    # beyond the second line.
    for line in re.split(r'\r\n|\r|\n', text, maxsplit=2)[:2]:
        match = coding_re.match(line)
        if match is not None:
            break
        if not blank_re.match(line):
            # A first line containing code rules out a declaration on
            # the second line.
            return None
    else:
        # Neither of the two lines carries a declaration.
        return None
    name = match.group(1)
    try:
        codecs.lookup(name)
    except LookupError:
        # The standard encoding error does not indicate the encoding
        raise LookupError("Unknown encoding: "+name) from None
    return name

65-66216722

class IOBinding:

6823

# One instance per editor Window so methods know which to save, close.

@@ -78,7 +33,7 @@ def __init__(self, editwin):

7833

self.save_as)

7934

self.__id_savecopy = self.text.bind("<<save-copy-of-window-as-file>>",

8035

self.save_a_copy)

81-

self.fileencoding = None

36+

self.fileencoding = 'utf-8'

8237

self.__id_print = self.text.bind("<<print-window>>", self.print_window)

83388439

def close(self):

@@ -165,34 +120,44 @@ def open(self, event=None, editFile=None):

165120

self.text.focus_set()

166121

return "break"

167122168-

eol = r"(\r\n)|\n|\r" # \r\n (Windows), \n (UNIX), or \r (Mac)

169-

eol_re = re.compile(eol)

170123

eol_convention = os.linesep # default

171124172125

def loadfile(self, filename):

173126

try:

174-

# open the file in binary mode so that we can handle

175-

# end-of-line convention ourselves.

176-

with open(filename, 'rb') as f:

177-

two_lines = f.readline() + f.readline()

178-

f.seek(0)

179-

bytes = f.read()

180-

except OSError as msg:

181-

tkMessageBox.showerror("I/O Error", str(msg), parent=self.text)

127+

try:

128+

with tokenize.open(filename) as f:

129+

chars = f.read()

130+

fileencoding = f.encoding

131+

eol_convention = f.newlines

132+

converted = False

133+

except (UnicodeDecodeError, SyntaxError):

134+

# Wait for the editor window to appear

135+

self.editwin.text.update()

136+

enc = askstring(

137+

"Specify file encoding",

138+

"The file's encoding is invalid for Python 3.x.\n"

139+

"IDLE will convert it to UTF-8.\n"

140+

"What is the current encoding of the file?",

141+

initialvalue='utf-8',

142+

parent=self.editwin.text)

143+

with open(filename, encoding=enc) as f:

144+

chars = f.read()

145+

fileencoding = f.encoding

146+

eol_convention = f.newlines

147+

converted = True

148+

except OSError as err:

149+

tkMessageBox.showerror("I/O Error", str(err), parent=self.text)

182150

return False

183-

chars, converted = self._decode(two_lines, bytes)

184-

if chars is None:

151+

except UnicodeDecodeError:

185152

tkMessageBox.showerror("Decoding Error",

186153

"File %s\nFailed to Decode" % filename,

187154

parent=self.text)

188155

return False

189-

# We now convert all end-of-lines to '\n's

190-

firsteol = self.eol_re.search(chars)

191-

if firsteol:

192-

self.eol_convention = firsteol.group(0)

193-

chars = self.eol_re.sub(r"\n", chars)

156+194157

self.text.delete("1.0", "end")

195158

self.set_filename(None)

159+

self.fileencoding = fileencoding

160+

self.eol_convention = eol_convention

196161

self.text.insert("1.0", chars)

197162

self.reset_undo()

198163

self.set_filename(filename)

@@ -205,74 +170,6 @@ def loadfile(self, filename):

205170

self.updaterecentfileslist(filename)

206171

return True

207172208-

def _decode(self, two_lines, bytes):

209-

"Create a Unicode string."

210-

chars = None

211-

# Check presence of a UTF-8 signature first

212-

if bytes.startswith(BOM_UTF8):

213-

try:

214-

chars = bytes[3:].decode("utf-8")

215-

except UnicodeDecodeError:

216-

# has UTF-8 signature, but fails to decode...

217-

return None, False

218-

else:

219-

# Indicates that this file originally had a BOM

220-

self.fileencoding = 'BOM'

221-

return chars, False

222-

# Next look for coding specification

223-

try:

224-

enc = coding_spec(two_lines)

225-

except LookupError as name:

226-

tkMessageBox.showerror(

227-

title="Error loading the file",

228-

message="The encoding '%s' is not known to this Python "\

229-

"installation. The file may not display correctly" % name,

230-

parent = self.text)

231-

enc = None

232-

except UnicodeDecodeError:

233-

return None, False

234-

if enc:

235-

try:

236-

chars = str(bytes, enc)

237-

self.fileencoding = enc

238-

return chars, False

239-

except UnicodeDecodeError:

240-

pass

241-

# Try ascii:

242-

try:

243-

chars = str(bytes, 'ascii')

244-

self.fileencoding = None

245-

return chars, False

246-

except UnicodeDecodeError:

247-

pass

248-

# Try utf-8:

249-

try:

250-

chars = str(bytes, 'utf-8')

251-

self.fileencoding = 'utf-8'

252-

return chars, False

253-

except UnicodeDecodeError:

254-

pass

255-

# Finally, try the locale's encoding. This is deprecated;

256-

# the user should declare a non-ASCII encoding

257-

try:

258-

# Wait for the editor window to appear

259-

self.editwin.text.update()

260-

enc = askstring(

261-

"Specify file encoding",

262-

"The file's encoding is invalid for Python 3.x.\n"

263-

"IDLE will convert it to UTF-8.\n"

264-

"What is the current encoding of the file?",

265-

initialvalue = encoding,

266-

parent = self.editwin.text)

267-268-

if enc:

269-

chars = str(bytes, enc)

270-

self.fileencoding = None

271-

return chars, True

272-

except (UnicodeDecodeError, LookupError):

273-

pass

274-

return None, False # None on failure

275-276173

def maybesave(self):

277174

if self.get_saved():

278175

return "yes"

@@ -360,38 +257,30 @@ def encode(self, chars):

360257

# text to us. Don't try to guess further.

361258

return chars

362259

# Preserve a BOM that might have been present on opening

363-

if self.fileencoding == 'BOM':

364-

return BOM_UTF8 + chars.encode("utf-8")

260+

if self.fileencoding == 'utf-8-sig':

261+

return chars.encode('utf-8-sig')

365262

# See whether there is anything non-ASCII in it.

366263

# If not, no need to figure out the encoding.

367264

try:

368265

return chars.encode('ascii')

369-

except UnicodeError:

266+

except UnicodeEncodeError:

370267

pass

371268

# Check if there is an encoding declared

372269

try:

373-

# a string, let coding_spec slice it to the first two lines

374-

enc = coding_spec(chars)

375-

failed = None

376-

except LookupError as msg:

377-

failed = msg

378-

enc = None

379-

else:

380-

if not enc:

381-

# PEP 3120: default source encoding is UTF-8

382-

enc = 'utf-8'

383-

if enc:

384-

try:

385-

return chars.encode(enc)

386-

except UnicodeError:

387-

failed = "Invalid encoding '%s'" % enc

270+

encoded = chars.encode('ascii', 'replace')

271+

enc, _ = tokenize.detect_encoding(io.BytesIO(encoded).readline)

272+

return chars.encode(enc)

273+

except SyntaxError as err:

274+

failed = str(err)

275+

except UnicodeEncodeError:

276+

failed = "Invalid encoding '%s'" % enc

388277

tkMessageBox.showerror(

389278

"I/O Error",

390279

"%s.\nSaving as UTF-8" % failed,

391-

parent = self.text)

280+

parent=self.text)

392281

# Fallback: save as UTF-8, with BOM - ignoring the incorrect

393282

# declared encoding

394-

return BOM_UTF8 + chars.encode("utf-8")

283+

return chars.encode('utf-8-sig')

395284396285

def print_window(self, event):

397286

confirm = tkMessageBox.askokcancel(