Test failures with Python 3.12.0b1

Overview Description

The test suite fails when run with Python 3.12.0b1:

FAILED tests/messages/test_extract.py::ExtractPythonTestCase::test_utf8_message_with_utf8_bom -   File "<string>", line 1
FAILED tests/messages/test_extract.py::ExtractPythonTestCase::test_utf8_message_with_utf8_bom_and_magic_comment -   File "<string>", line 1
FAILED tests/messages/test_extract.py::ExtractPythonTestCase::test_utf8_raw_strings_match_unicode_strings -   File "<string>", line 1
FAILED tests/messages/test_extract.py::ExtractTestCase::test_f_strings - AssertionError: assert 3 == 4
FAILED tests/messages/test_extract.py::ExtractTestCase::test_f_strings_non_utf8 - assert 0 == 1

Furthermore, tox -e py312 fails by default because the distutils module is missing (installing setuptools works around that, but the use of distutils should be removed altogether).
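
For context, distutils was removed from the standard library in Python 3.12 (PEP 632), so importing it fails out of the box. A quick illustrative check (not Babel code):

    import sys

    try:
        import distutils  # removed from the stdlib in Python 3.12 (PEP 632)
    except ModuleNotFoundError:
        print(f"distutils is unavailable on Python {sys.version.split()[0]}")
    # Installing setuptools injects a distutils shim, which is the workaround
    # mentioned above; the proper fix is to drop the distutils dependency.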

Steps to Reproduce

  1. tox -e py312

Actual Results

________________________________________ ExtractPythonTestCase.test_utf8_message_with_utf8_bom ________________________________________

self = <tests.messages.test_extract.ExtractPythonTestCase testMethod=test_utf8_message_with_utf8_bom>

        def test_utf8_message_with_utf8_bom(self):
            buf = BytesIO(codecs.BOM_UTF8 + """
    # NOTE: hello
    msg = _('Bonjour à tous')
    """.encode('utf-8'))
>           messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))

tests/messages/test_extract.py:367: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
babel/messages/extract.py:500: in extract_python
    for tok, value, (lineno, _), _, _ in tokens:
/usr/lib/python3.12/tokenize.py:451: in _tokenize
    for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

source = "\ufeff\n# NOTE: hello\nmsg = _('Bonjour à tous')\n", extra_tokens = True

    def _generate_tokens_from_c_tokenizer(source, extra_tokens=False):
        """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
        import _tokenize as c_tokenizer
>       for info in c_tokenizer.TokenizerIter(source, extra_tokens=extra_tokens):
E         File "<string>", line 1
E           
E           ^
E       SyntaxError: invalid non-printable character U+FEFF

/usr/lib/python3.12/tokenize.py:542: SyntaxError
_______________________________ ExtractPythonTestCase.test_utf8_message_with_utf8_bom_and_magic_comment _______________________________

self = <tests.messages.test_extract.ExtractPythonTestCase testMethod=test_utf8_message_with_utf8_bom_and_magic_comment>

        def test_utf8_message_with_utf8_bom_and_magic_comment(self):
            buf = BytesIO(codecs.BOM_UTF8 + """# -*- coding: utf-8 -*-
    # NOTE: hello
    msg = _('Bonjour à tous')
    """.encode('utf-8'))
>           messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))

tests/messages/test_extract.py:376: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
babel/messages/extract.py:500: in extract_python
    for tok, value, (lineno, _), _, _ in tokens:
/usr/lib/python3.12/tokenize.py:451: in _tokenize
    for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

source = "\ufeff# -*- coding: utf-8 -*-\n# NOTE: hello\nmsg = _('Bonjour à tous')\n", extra_tokens = True

    def _generate_tokens_from_c_tokenizer(source, extra_tokens=False):
        """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
        import _tokenize as c_tokenizer
>       for info in c_tokenizer.TokenizerIter(source, extra_tokens=extra_tokens):
E         File "<string>", line 1
E           # -*- coding: utf-8 -*-
E           ^
E       SyntaxError: invalid non-printable character U+FEFF

/usr/lib/python3.12/tokenize.py:542: SyntaxError
__________________________________ ExtractPythonTestCase.test_utf8_raw_strings_match_unicode_strings __________________________________

self = <tests.messages.test_extract.ExtractPythonTestCase testMethod=test_utf8_raw_strings_match_unicode_strings>

        def test_utf8_raw_strings_match_unicode_strings(self):
            buf = BytesIO(codecs.BOM_UTF8 + """
    msg = _('Bonjour à tous')
    msgu = _(u'Bonjour à tous')
    """.encode('utf-8'))
>           messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))

tests/messages/test_extract.py:393: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
babel/messages/extract.py:500: in extract_python
    for tok, value, (lineno, _), _, _ in tokens:
/usr/lib/python3.12/tokenize.py:451: in _tokenize
    for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

source = "\ufeff\nmsg = _('Bonjour à tous')\nmsgu = _(u'Bonjour à tous')\n", extra_tokens = True

    def _generate_tokens_from_c_tokenizer(source, extra_tokens=False):
        """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
        import _tokenize as c_tokenizer
>       for info in c_tokenizer.TokenizerIter(source, extra_tokens=extra_tokens):
E         File "<string>", line 1
E           
E           ^
E       SyntaxError: invalid non-printable character U+FEFF

/usr/lib/python3.12/tokenize.py:542: SyntaxError
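
The three BOM failures above reproduce without Babel; a minimal reduction (assuming the same tokenize.generate_tokens() entry point that extract_python() reaches, per the tracebacks):

    import io
    import tokenize

    source = "\ufeffmsg = _('Bonjour à tous')\n"  # leading U+FEFF, as in the tests
    for tok in tokenize.generate_tokens(io.StringIO(source).readline):
        print(tok)
    # Python 3.11: the BOM surfaces as an ERRORTOKEN and tokenization continues,
    # so the extractor can simply skip it
    # Python 3.12.0b1: SyntaxError: invalid non-printable character U+FEFF
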
___________________________________________________ ExtractTestCase.test_f_strings ____________________________________________________

self = <tests.messages.test_extract.ExtractTestCase testMethod=test_f_strings>

        def test_f_strings(self):
            buf = BytesIO(br"""
    t1 = _('foobar')
    t2 = _(f'spameggs' f'feast')  # should be extracted; constant parts only
    t2 = _(f'spameggs' 'kerroshampurilainen')  # should be extracted (mixing f with no f)
    t3 = _(f'''whoa! a '''  # should be extracted (continues on following lines)
    f'flying shark'
        '... hello'
    )
    t4 = _(f'spameggs {t1}')  # should not be extracted
    """)
            messages = list(extract.extract('python', buf, extract.DEFAULT_KEYWORDS, [], {}))
>           assert len(messages) == 4
E           AssertionError: assert 3 == 4
E            +  where 3 = len([(2, 'foobar', [], None), (4, 'kerroshampurilainen', [], None), (5, '... hello', [], None)])

tests/messages/test_extract.py:544: AssertionError
_______________________________________________ ExtractTestCase.test_f_strings_non_utf8 _______________________________________________

self = <tests.messages.test_extract.ExtractTestCase testMethod=test_f_strings_non_utf8>

        def test_f_strings_non_utf8(self):
            buf = BytesIO(b"""
    # -- coding: latin-1 --
    t2 = _(f'\xe5\xe4\xf6' f'\xc5\xc4\xd6')
    """)
            messages = list(extract.extract('python', buf, extract.DEFAULT_KEYWORDS, [], {}))
>           assert len(messages) == 1
E           assert 0 == 1
E            +  where 0 = len([])

tests/messages/test_extract.py:556: AssertionError
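
The two f-string failures are different: nothing raises, but messages go missing. A plausible cause (my reading, not confirmed here) is PEP 701: in 3.12 the tokenizer emits FSTRING_START/FSTRING_MIDDLE/FSTRING_END tokens instead of a single STRING token per f-string, so an extractor that matches on STRING no longer sees the constant parts. Illustrative sketch:

    import io
    import tokenize

    src = "t2 = _(f'spameggs' f'feast')\n"
    for tok in tokenize.generate_tokens(io.StringIO(src).readline):
        print(tokenize.tok_name[tok.type], repr(tok.string))
    # Python 3.11: each f-string arrives as one STRING token, e.g. "f'spameggs'"
    # Python 3.12: FSTRING_START "f'", FSTRING_MIDDLE "spameggs", FSTRING_END "'"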

Expected Results

Passing tests (or at least the same results as under Python 3.11).

Reproducibility

Always.

Additional Information

Confirmed with git commit 8b152db.
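
Both failure classes plausibly stem from tokenize being reimplemented on top of the C tokenizer in 3.12, visible in the tracebacks as _generate_tokens_from_c_tokenizer().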