bpo-41158: IDLE: rewrite the code for handling file encoding (GH-21215)

bpo-41158: IDLE: rewrite the code for handling file encoding (GH-21215) · python/cpython@fe0175f

-import codecs
-from codecs import BOM_UTF8
 import os
-import re
 import shlex
 import sys
 import tempfile
+import tokenize

 import tkinter.filedialog as tkFileDialog
 import tkinter.messagebox as tkMessageBox

@@ -20,49 +18,6 @@

     errors = 'surrogateescape'


-coding_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
-blank_re = re.compile(r'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
-
-def coding_spec(data):
-    """Return the encoding declaration according to PEP 263.
-
-    When checking encoded data, only the first two lines should be passed
-    in to avoid a UnicodeDecodeError if the rest of the data is not unicode.
-    The first two lines would contain the encoding specification.
-
-    Raise a LookupError if the encoding is declared but unknown.
-    """
-    if isinstance(data, bytes):
-        # This encoding might be wrong. However, the coding
-        # spec must be ASCII-only, so any non-ASCII characters
-        # around here will be ignored. Decoding to Latin-1 should
-        # never fail (except for memory outage)
-        lines = data.decode('iso-8859-1')
-    else:
-        lines = data
-    # consider only the first two lines
-    if '\n' in lines:
-        lst = lines.split('\n', 2)[:2]
-    elif '\r' in lines:
-        lst = lines.split('\r', 2)[:2]
-    else:
-        lst = [lines]
-    for line in lst:
-        match = coding_re.match(line)
-        if match is not None:
-            break
-        if not blank_re.match(line):
-            return None
-    else:
-        return None
-    name = match.group(1)
-    try:
-        codecs.lookup(name)
-    except LookupError:
-        # The standard encoding error does not indicate the encoding
-        raise LookupError("Unknown encoding: "+name)
-    return name
-

 class IOBinding:
     # One instance per editor Window so methods know which to save, close.

@@ -78,7 +33,7 @@ def __init__(self, editwin):

                                           self.save_as)
         self.__id_savecopy = self.text.bind("<<save-copy-of-window-as-file>>",
                                             self.save_a_copy)
-        self.fileencoding = None
+        self.fileencoding = 'utf-8'
         self.__id_print = self.text.bind("<<print-window>>", self.print_window)

     def close(self):

@@ -165,34 +120,44 @@ def open(self, event=None, editFile=None):

             self.text.focus_set()
         return "break"

-    eol = r"(\r\n)|\n|\r" # \r\n (Windows), \n (UNIX), or \r (Mac)
-    eol_re = re.compile(eol)
     eol_convention = os.linesep # default

     def loadfile(self, filename):
         try:
-            # open the file in binary mode so that we can handle
-            # end-of-line convention ourselves.
-            with open(filename, 'rb') as f:
-                two_lines = f.readline() + f.readline()
-                f.seek(0)
-                bytes = f.read()
-        except OSError as msg:
-            tkMessageBox.showerror("I/O Error", str(msg), parent=self.text)
+            try:
+                with tokenize.open(filename) as f:
+                    chars = f.read()
+                    fileencoding = f.encoding
+                    eol_convention = f.newlines
+                    converted = False
+            except (UnicodeDecodeError, SyntaxError):
+                # Wait for the editor window to appear
+                self.editwin.text.update()
+                enc = askstring(
+                    "Specify file encoding",
+                    "The file's encoding is invalid for Python 3.x.\n"
+                    "IDLE will convert it to UTF-8.\n"
+                    "What is the current encoding of the file?",
+                    initialvalue='utf-8',
+                    parent=self.editwin.text)
+                with open(filename, encoding=enc) as f:
+                    chars = f.read()
+                    fileencoding = f.encoding
+                    eol_convention = f.newlines
+                    converted = True
+        except OSError as err:
+            tkMessageBox.showerror("I/O Error", str(err), parent=self.text)
             return False
-        chars, converted = self._decode(two_lines, bytes)
-        if chars is None:
+        except UnicodeDecodeError:
             tkMessageBox.showerror("Decoding Error",
                                    "File %s\nFailed to Decode" % filename,
                                    parent=self.text)
             return False
-        # We now convert all end-of-lines to '\n's
-        firsteol = self.eol_re.search(chars)
-        if firsteol:
-            self.eol_convention = firsteol.group(0)
-            chars = self.eol_re.sub(r"\n", chars)
+
         self.text.delete("1.0", "end")
         self.set_filename(None)
+        self.fileencoding = fileencoding
+        self.eol_convention = eol_convention
         self.text.insert("1.0", chars)
         self.reset_undo()
         self.set_filename(filename)

@@ -205,74 +170,6 @@ def loadfile(self, filename):

         self.updaterecentfileslist(filename)
         return True

-    def _decode(self, two_lines, bytes):
-        "Create a Unicode string."
-        chars = None
-        # Check presence of a UTF-8 signature first
-        if bytes.startswith(BOM_UTF8):
-            try:
-                chars = bytes[3:].decode("utf-8")
-            except UnicodeDecodeError:
-                # has UTF-8 signature, but fails to decode...
-                return None, False
-            else:
-                # Indicates that this file originally had a BOM
-                self.fileencoding = 'BOM'
-                return chars, False
-        # Next look for coding specification
-        try:
-            enc = coding_spec(two_lines)
-        except LookupError as name:
-            tkMessageBox.showerror(
-                title="Error loading the file",
-                message="The encoding '%s' is not known to this Python "\
-                "installation. The file may not display correctly" % name,
-                parent = self.text)
-            enc = None
-        except UnicodeDecodeError:
-            return None, False
-        if enc:
-            try:
-                chars = str(bytes, enc)
-                self.fileencoding = enc
-                return chars, False
-            except UnicodeDecodeError:
-                pass
-        # Try ascii:
-        try:
-            chars = str(bytes, 'ascii')
-            self.fileencoding = None
-            return chars, False
-        except UnicodeDecodeError:
-            pass
-        # Try utf-8:
-        try:
-            chars = str(bytes, 'utf-8')
-            self.fileencoding = 'utf-8'
-            return chars, False
-        except UnicodeDecodeError:
-            pass
-        # Finally, try the locale's encoding. This is deprecated;
-        # the user should declare a non-ASCII encoding
-        try:
-            # Wait for the editor window to appear
-            self.editwin.text.update()
-            enc = askstring(
-                "Specify file encoding",
-                "The file's encoding is invalid for Python 3.x.\n"
-                "IDLE will convert it to UTF-8.\n"
-                "What is the current encoding of the file?",
-                initialvalue = encoding,
-                parent = self.editwin.text)
-
-            if enc:
-                chars = str(bytes, enc)
-                self.fileencoding = None
-            return chars, True
-        except (UnicodeDecodeError, LookupError):
-            pass
-        return None, False # None on failure
-
     def maybesave(self):
         if self.get_saved():
             return "yes"

@@ -360,38 +257,30 @@ def encode(self, chars):

             # text to us. Don't try to guess further.
             return chars
         # Preserve a BOM that might have been present on opening
-        if self.fileencoding == 'BOM':
-            return BOM_UTF8 + chars.encode("utf-8")
+        if self.fileencoding == 'utf-8-sig':
+            return chars.encode('utf-8-sig')
         # See whether there is anything non-ASCII in it.
         # If not, no need to figure out the encoding.
         try:
             return chars.encode('ascii')
-        except UnicodeError:
+        except UnicodeEncodeError:
             pass
         # Check if there is an encoding declared
         try:
-            # a string, let coding_spec slice it to the first two lines
-            enc = coding_spec(chars)
-            failed = None
-        except LookupError as msg:
-            failed = msg
-            enc = None
-        else:
-            if not enc:
-                # PEP 3120: default source encoding is UTF-8
-                enc = 'utf-8'
-        if enc:
-            try:
-                return chars.encode(enc)
-            except UnicodeError:
-                failed = "Invalid encoding '%s'" % enc
+            encoded = chars.encode('ascii', 'replace')
+            enc, _ = tokenize.detect_encoding(io.BytesIO(encoded).readline)
+            return chars.encode(enc)
+        except SyntaxError as err:
+            failed = str(err)
+        except UnicodeEncodeError:
+            failed = "Invalid encoding '%s'" % enc
         tkMessageBox.showerror(
             "I/O Error",
             "%s.\nSaving as UTF-8" % failed,
-            parent = self.text)
+            parent=self.text)
         # Fallback: save as UTF-8, with BOM - ignoring the incorrect
         # declared encoding
-        return BOM_UTF8 + chars.encode("utf-8")
+        return chars.encode('utf-8-sig')

     def print_window(self, event):
         confirm = tkMessageBox.askokcancel(