bpo-24665: double-width CJK chars support for textwrap by fgallaire · Pull Request #89

bpo-24665: double-width CJK chars support for textwrap by fgallaire · Pull Request #89 · python/cpython

Expand Up @@ -5,9 +5,10 @@ # Copyright (C) 2002, 2003 Python Software Foundation. # Written by Greg Ward <gward@python.net>
import re import re, unicodedata
__all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten'] __all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten', 'cjkwide', 'cjklen', 'cjkslices']
# Hardcode the recognized whitespace characters to the US-ASCII # whitespace characters. The main reason for doing this is that Expand All @@ -26,6 +27,8 @@ class TextWrapper: width (default: 70) the maximum width of wrapped lines (unless break_long_words is false) cjk (default: False) Handle double-width CJK chars. initial_indent (default: "") string that will be prepended to the first line of wrapped output. Counts towards the line's width. Expand Down Expand Up @@ -114,6 +117,7 @@ class TextWrapper:
def __init__(self, width=70, cjk=False, initial_indent="", subsequent_indent="", expand_tabs=True, Expand All @@ -127,6 +131,7 @@ def __init__(self, max_lines=None, placeholder=' [...]'): self.width = width self.cjk = cjk self.initial_indent = initial_indent self.subsequent_indent = subsequent_indent self.expand_tabs = expand_tabs Expand All @@ -139,6 +144,7 @@ def __init__(self, self.max_lines = max_lines self.placeholder = placeholder
self.len = cjklen if self.cjk else len
# -- Private methods ----------------------------------------------- # (possibly useful for subclasses to override) Expand Down Expand Up @@ -215,8 +221,13 @@ def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width): # If we're allowed to break long words, then do so: put as much # of the next chunk onto the current line as will fit. if self.break_long_words: cur_line.append(reversed_chunks[-1][:space_left]) reversed_chunks[-1] = reversed_chunks[-1][space_left:] if self.cjk: chunk_start, chunk_end = cjkslices(reversed_chunks[-1], space_left) cur_line.append(chunk_start) reversed_chunks[-1] = chunk_end else: cur_line.append(reversed_chunks[-1][:space_left]) reversed_chunks[-1] = reversed_chunks[-1][space_left:]
# Otherwise, we have to preserve the long word intact. Only add # it to the current line if there's nothing already there -- Expand Down Expand Up @@ -246,6 +257,9 @@ def _wrap_chunks(self, chunks): lines = [] if self.width <= 0: raise ValueError("invalid width %r (must be > 0)" % self.width) elif self.width == 1 and (sum(self.len(chunk) for chunk in chunks) > sum(len(chunk) for chunk in chunks)): raise ValueError("invalid width 1 (must be > 1 when CJK chars)") if self.max_lines is not None: if self.max_lines > 1: indent = self.subsequent_indent Expand Down Expand Up @@ -280,7 +294,7 @@ def _wrap_chunks(self, chunks): del chunks[-1]
while chunks: l = len(chunks[-1]) l = self.len(chunks[-1])
# Can at least squeeze this chunk onto the current line. if cur_len + l <= width: Expand All @@ -293,7 +307,7 @@ def _wrap_chunks(self, chunks):
# The current line is full, and the next chunk is too big to # fit on *any* line (not just this one). if chunks and len(chunks[-1]) > width: if chunks and self.len(chunks[-1]) > width: self._handle_long_word(chunks, cur_line, cur_len, width) cur_len = sum(map(len, cur_line))
Expand Down Expand Up @@ -365,7 +379,7 @@ def fill(self, text):
# -- Convenience interface ---------------------------------------------
def wrap(text, width=70, **kwargs): def wrap(text, width=70, cjk=False, **kwargs): """Wrap a single paragraph of text, returning a list of wrapped lines.
Reformat the single paragraph in 'text' so it fits in lines of no Expand All @@ -375,10 +389,10 @@ def wrap(text, width=70, **kwargs): space. See TextWrapper class for available keyword args to customize wrapping behaviour. """ w = TextWrapper(width=width, **kwargs) w = TextWrapper(width=width, cjk=cjk, **kwargs) return w.wrap(text)
def fill(text, width=70, **kwargs): def fill(text, width=70, cjk=False, **kwargs): """Fill a single paragraph of text, returning a new string.
Reformat the single paragraph in 'text' to fit in lines of no more Expand All @@ -387,10 +401,10 @@ def fill(text, width=70, **kwargs): whitespace characters converted to space. See TextWrapper class for available keyword args to customize wrapping behaviour. """ w = TextWrapper(width=width, **kwargs) w = TextWrapper(width=width, cjk=cjk, **kwargs) return w.fill(text)
def shorten(text, width, **kwargs): def shorten(text, width, cjk=False, **kwargs): """Collapse and truncate the given text to fit in the given width.
The text first has its whitespace collapsed. If it then fits in Expand All @@ -402,10 +416,41 @@ def shorten(text, width, **kwargs): >>> textwrap.shorten("Hello world!", width=11) 'Hello [...]' """ w = TextWrapper(width=width, max_lines=1, **kwargs) w = TextWrapper(width=width, cjk=cjk, max_lines=1, **kwargs) return w.fill(' '.join(text.strip().split()))

# -- CJK support ------------------------------------------------------
def cjkwide(char): """Return True if char is Fullwidth or Wide, False otherwise. Fullwidth and Wide CJK chars are double-width. """ return unicodedata.east_asian_width(char) in ('F', 'W')

def cjklen(text): """Return the real width of text (its len if not a string). """ if not isinstance(text, str): return len(text) return sum(2 if cjkwide(char) else 1 for char in text)

def cjkslices(text, index): """Return the two slices of text cut to the index. """ if not isinstance(text, str): return text[:index], text[index:] if cjklen(text) <= index: return text, '' i = 1 # <= and i-1 to catch the last double length char of odd line while cjklen(text[:i]) <= index: i = i + 1 return text[:i-1], text[i-1:]

# -- Loosely related functionality -------------------------------------
_whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE) Expand Down