bpo-41158: IDLE: rewrite the code for handling file encoding (GH-21215) · python/cpython@fe0175f
1-import codecs
2-from codecs import BOM_UTF8
31import os
4-import re
52import shlex
63import sys
74import tempfile
5+import tokenize
8697import tkinter.filedialog as tkFileDialog
108import tkinter.messagebox as tkMessageBox
@@ -20,49 +18,6 @@
2018errors = 'surrogateescape'
# PEP 263 declaration, e.g. "# -*- coding: latin-1 -*-" or "# coding=utf-8".
coding_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
# A line that is empty, whitespace-only, or nothing but a comment.
blank_re = re.compile(r'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
# Any single line terminator: \r\n (Windows), \r (old Mac), or \n (Unix).
_eol_re = re.compile(r'\r\n|\r|\n')


def coding_spec(data):
    """Return the encoding declaration according to PEP 263.

    data may be bytes or str.  Only its first two lines are examined; a
    declaration on the second line counts only when the first line is
    blank or a comment.

    When checking encoded data, only the first two lines should be passed
    in to avoid a UnicodeDecodeError if the rest of the data is not unicode.
    The first two lines would contain the encoding specification.

    Return the declared encoding name, or None if there is no declaration.
    Raise a LookupError if the encoding is declared but unknown.
    """
    if isinstance(data, bytes):
        # This encoding might be wrong.  However, the coding spec must be
        # ASCII-only, so any non-ASCII characters around here will be
        # ignored.  Decoding to Latin-1 never fails on any byte value.
        text = data.decode('iso-8859-1')
    else:
        text = data
    # Split on whichever terminator actually ends each line, instead of
    # choosing one terminator for the whole text: a file whose first lines
    # end in '\r' may still contain a '\n' further down.
    for line in _eol_re.split(text, maxsplit=2)[:2]:
        match = coding_re.match(line)
        if match is not None:
            break
        if not blank_re.match(line):
            # Real code before any declaration: nothing is declared.
            return None
    else:
        # Ran out of candidate lines without finding a declaration.
        return None
    name = match.group(1)
    try:
        codecs.lookup(name)
    except LookupError as err:
        # Re-raise with a message that names the offending encoding.
        raise LookupError("Unknown encoding: " + name) from err
    return name
65-66216722class IOBinding:
6823# One instance per editor Window so methods know which to save, close.
@@ -78,7 +33,7 @@ def __init__(self, editwin):
7833self.save_as)
7934self.__id_savecopy = self.text.bind("<<save-copy-of-window-as-file>>",
8035self.save_a_copy)
81-self.fileencoding = None
36+self.fileencoding = 'utf-8'
8237self.__id_print = self.text.bind("<<print-window>>", self.print_window)
83388439def close(self):
@@ -165,34 +120,44 @@ def open(self, event=None, editFile=None):
165120self.text.focus_set()
166121return "break"
167122168-eol = r"(\r\n)|\n|\r" # \r\n (Windows), \n (UNIX), or \r (Mac)
169-eol_re = re.compile(eol)
170123eol_convention = os.linesep # default
171124172125def loadfile(self, filename):
173126try:
174-# open the file in binary mode so that we can handle
175-# end-of-line convention ourselves.
176-with open(filename, 'rb') as f:
177-two_lines = f.readline() + f.readline()
178-f.seek(0)
179-bytes = f.read()
180-except OSError as msg:
181-tkMessageBox.showerror("I/O Error", str(msg), parent=self.text)
127+try:
128+with tokenize.open(filename) as f:
129+chars = f.read()
130+fileencoding = f.encoding
131+eol_convention = f.newlines
132+converted = False
133+except (UnicodeDecodeError, SyntaxError):
134+# Wait for the editor window to appear
135+self.editwin.text.update()
136+enc = askstring(
137+"Specify file encoding",
138+"The file's encoding is invalid for Python 3.x.\n"
139+"IDLE will convert it to UTF-8.\n"
140+"What is the current encoding of the file?",
141+initialvalue='utf-8',
142+parent=self.editwin.text)
143+with open(filename, encoding=enc) as f:
144+chars = f.read()
145+fileencoding = f.encoding
146+eol_convention = f.newlines
147+converted = True
148+except OSError as err:
149+tkMessageBox.showerror("I/O Error", str(err), parent=self.text)
182150return False
183-chars, converted = self._decode(two_lines, bytes)
184-if chars is None:
151+except UnicodeDecodeError:
185152tkMessageBox.showerror("Decoding Error",
186153"File %s\nFailed to Decode" % filename,
187154parent=self.text)
188155return False
189-# We now convert all end-of-lines to '\n's
190-firsteol = self.eol_re.search(chars)
191-if firsteol:
192-self.eol_convention = firsteol.group(0)
193-chars = self.eol_re.sub(r"\n", chars)
156+194157self.text.delete("1.0", "end")
195158self.set_filename(None)
159+self.fileencoding = fileencoding
160+self.eol_convention = eol_convention
196161self.text.insert("1.0", chars)
197162self.reset_undo()
198163self.set_filename(filename)
@@ -205,74 +170,6 @@ def loadfile(self, filename):
205170self.updaterecentfileslist(filename)
206171return True
def _decode(self, two_lines, bytes):
    """Create a Unicode string from the raw contents of a file.

    two_lines -- the first two lines of the file, as bytes; only these
        are searched for a coding declaration.
    bytes -- the complete raw contents of the file.  (The name shadows
        the builtin; it is kept so existing callers are unaffected.)

    Return a pair (chars, converted).  chars is the decoded text, or None
    if every attempt failed.  converted is True only when the text was
    decoded with an encoding supplied by the user at the prompt.

    On success self.fileencoding is set to 'BOM' (UTF-8 signature found),
    to the declared encoding name, to 'utf-8', or to None (plain ASCII,
    or decoded with a user-supplied encoding).
    """
    raw = bytes
    # Check presence of a UTF-8 signature first.
    if raw.startswith(BOM_UTF8):
        try:
            chars = raw[len(BOM_UTF8):].decode("utf-8")
        except UnicodeDecodeError:
            # Has a UTF-8 signature, but fails to decode...
            return None, False
        # Indicates that this file originally had a BOM.
        self.fileencoding = 'BOM'
        return chars, False

    # Next look for a coding specification.
    try:
        declared = coding_spec(two_lines)
    except LookupError as err:
        # coding_spec reports "Unknown encoding: <name>"; show only the
        # name inside the quotes rather than the whole message.
        unknown = str(err)
        prefix = "Unknown encoding: "
        if unknown.startswith(prefix):
            unknown = unknown[len(prefix):]
        tkMessageBox.showerror(
            title="Error loading the file",
            message="The encoding '%s' is not known to this Python "
                    "installation. The file may not display correctly"
                    % unknown,
            parent=self.text)
        declared = None
    except UnicodeDecodeError:
        return None, False

    # Each entry: (codec to try, value stored in self.fileencoding).
    # Order matters: declared encoding, then ASCII, then UTF-8.
    attempts = []
    if declared:
        attempts.append((declared, declared))
    attempts.append(('ascii', None))
    attempts.append(('utf-8', 'utf-8'))
    for codec, recorded in attempts:
        try:
            chars = str(raw, codec)
        except UnicodeDecodeError:
            continue
        self.fileencoding = recorded
        return chars, False

    # Finally, ask the user.  This is deprecated; the user should declare
    # a non-ASCII encoding in the file.
    try:
        # Wait for the editor window to appear.
        self.editwin.text.update()
        answer = askstring(
            "Specify file encoding",
            "The file's encoding is invalid for Python 3.x.\n"
            "IDLE will convert it to UTF-8.\n"
            "What is the current encoding of the file?",
            initialvalue=encoding,
            parent=self.editwin.text)
        if answer:
            chars = str(raw, answer)
            self.fileencoding = None
            return chars, True
    except (UnicodeDecodeError, LookupError):
        # Wrong or unknown encoding typed by the user: report failure.
        pass
    return None, False  # None on failure
275-276173def maybesave(self):
277174if self.get_saved():
278175return "yes"
@@ -360,38 +257,30 @@ def encode(self, chars):
360257# text to us. Don't try to guess further.
361258return chars
362259# Preserve a BOM that might have been present on opening
363-if self.fileencoding == 'BOM':
364-return BOM_UTF8 + chars.encode("utf-8")
260+if self.fileencoding == 'utf-8-sig':
261+return chars.encode('utf-8-sig')
365262# See whether there is anything non-ASCII in it.
366263# If not, no need to figure out the encoding.
367264try:
368265return chars.encode('ascii')
369-except UnicodeError:
266+except UnicodeEncodeError:
370267pass
371268# Check if there is an encoding declared
372269try:
373-# a string, let coding_spec slice it to the first two lines
374-enc = coding_spec(chars)
375-failed = None
376-except LookupError as msg:
377-failed = msg
378-enc = None
379-else:
380-if not enc:
381-# PEP 3120: default source encoding is UTF-8
382-enc = 'utf-8'
383-if enc:
384-try:
385-return chars.encode(enc)
386-except UnicodeError:
387-failed = "Invalid encoding '%s'" % enc
270+encoded = chars.encode('ascii', 'replace')
271+enc, _ = tokenize.detect_encoding(io.BytesIO(encoded).readline)
272+return chars.encode(enc)
273+except SyntaxError as err:
274+failed = str(err)
275+except UnicodeEncodeError:
276+failed = "Invalid encoding '%s'" % enc
388277tkMessageBox.showerror(
389278"I/O Error",
390279"%s.\nSaving as UTF-8" % failed,
391-parent = self.text)
280+parent=self.text)
392281# Fallback: save as UTF-8, with BOM - ignoring the incorrect
393282# declared encoding
394-return BOM_UTF8 + chars.encode("utf-8")
283+return chars.encode('utf-8-sig')
395284396285def print_window(self, event):
397286confirm = tkMessageBox.askokcancel(