| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283 |
- """
- Test the implementation of PEP 540: the UTF-8 Mode.
- """
- import locale
- import subprocess
- import sys
- import textwrap
- import unittest
- from test import support
- from test.support.script_helper import assert_python_ok, assert_python_failure
- from test.support import os_helper
# Platform switches and the names of the POSIX locales, evaluated once at
# import time.
MS_WINDOWS = sys.platform == 'win32'
POSIX_LOCALES = ('C', 'POSIX')
VXWORKS = sys.platform == "vxworks"
class UTF8ModeTests(unittest.TestCase):
    """Check the UTF-8 Mode (PEP 540) by inspecting child interpreters.

    Every test launches a fresh Python process with a particular mix of
    command line options and environment variables, then compares what the
    child printed against the expected value.
    """

    # Keyword arguments applied to every child run; get_output() merges the
    # test-specific values on top of these, so a test can override any entry.
    DEFAULT_ENV = {
        'PYTHONUTF8': '',
        'PYTHONLEGACYWINDOWSFSENCODING': '',
        'PYTHONCOERCECLOCALE': '0',
    }

    def posix_locale(self):
        """Return True when the current LC_CTYPE locale is in POSIX_LOCALES."""
        current = locale.setlocale(locale.LC_CTYPE, None)
        return current in POSIX_LOCALES

    def get_output(self, *args, failure=False, **kw):
        """Run a child interpreter and return its decoded output.

        *args* are forwarded as command line arguments and *kw*, merged on
        top of DEFAULT_ENV, as keyword arguments to the script helper.

        With ``failure=True`` the run goes through assert_python_failure()
        and element 2 of its result is returned; otherwise it goes through
        assert_python_ok() and element 1 is returned.  Trailing newline and
        carriage return characters are stripped from the decoded text.
        """
        env = {**self.DEFAULT_ENV, **kw}
        if failure:
            raw = assert_python_failure(*args, **env)[2]
        else:
            raw = assert_python_ok(*args, **env)[1]
        return raw.decode().rstrip("\n\r")

    @unittest.skipIf(MS_WINDOWS, 'Windows has no POSIX locale')
    def test_posix_locale(self):
        # Running under a POSIX locale is expected to turn the mode on.
        code = 'import sys; print(sys.flags.utf8_mode)'

        for name in POSIX_LOCALES:
            with self.subTest(LC_ALL=name):
                result = self.get_output('-c', code, LC_ALL=name)
                self.assertEqual(result, '1')

    def test_xoption(self):
        code = 'import sys; print(sys.flags.utf8_mode)'

        # "utf8=1" is an undocumented but accepted syntax.
        option_cases = (
            ('utf8', '1'),
            ('utf8=1', '1'),
            ('utf8=0', '0'),
        )
        for option, expected in option_cases:
            result = self.get_output('-X', option, '-c', code)
            self.assertEqual(result, expected)

        if MS_WINDOWS:
            # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 Mode
            # and has the priority over -X utf8
            result = self.get_output('-X', 'utf8', '-c', code,
                                     PYTHONLEGACYWINDOWSFSENCODING='1')
            self.assertEqual(result, '0')

    def test_env_var(self):
        code = 'import sys; print(sys.flags.utf8_mode)'

        # The reported flag mirrors the value of PYTHONUTF8.
        for value in ('1', '0'):
            result = self.get_output('-c', code, PYTHONUTF8=value)
            self.assertEqual(result, value)

        # -X utf8 has the priority over PYTHONUTF8
        result = self.get_output('-X', 'utf8=0', '-c', code, PYTHONUTF8='1')
        self.assertEqual(result, '0')

        if MS_WINDOWS:
            # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode
            # and has the priority over PYTHONUTF8
            result = self.get_output('-X', 'utf8', '-c', code,
                                     PYTHONUTF8='1',
                                     PYTHONLEGACYWINDOWSFSENCODING='1')
            self.assertEqual(result, '0')

        # Cannot test with the POSIX locale, since the POSIX locale enables
        # the UTF-8 mode
        if not self.posix_locale():
            # PYTHONUTF8 should be ignored if -E is used
            result = self.get_output('-E', '-c', code, PYTHONUTF8='1')
            self.assertEqual(result, '0')

        # invalid mode
        message = self.get_output('-c', code, PYTHONUTF8='xxx', failure=True)
        self.assertIn('invalid PYTHONUTF8 environment variable value',
                      message.rstrip())

    def test_filesystemencoding(self):
        code = textwrap.dedent('''
            import sys
            print("{}/{}".format(sys.getfilesystemencoding(),
            sys.getfilesystemencodeerrors()))
        ''')

        # The expected error handler differs between Windows and the rest.
        expected = ('utf-8/surrogatepass' if MS_WINDOWS
                    else 'utf-8/surrogateescape')

        result = self.get_output('-X', 'utf8', '-c', code)
        self.assertEqual(result, expected)

        if MS_WINDOWS:
            # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode
            # and has the priority over -X utf8 and PYTHONUTF8
            result = self.get_output('-X', 'utf8', '-c', code,
                                     PYTHONUTF8='strict',
                                     PYTHONLEGACYWINDOWSFSENCODING='1')
            self.assertEqual(result, 'mbcs/replace')

    def test_stdio(self):
        code = textwrap.dedent('''
            import sys
            print(f"stdin: {sys.stdin.encoding}/{sys.stdin.errors}")
            print(f"stdout: {sys.stdout.encoding}/{sys.stdout.errors}")
            print(f"stderr: {sys.stderr.encoding}/{sys.stderr.errors}")
        ''')

        # Each entry pairs a PYTHONIOENCODING value with the lines the child
        # is expected to print for stdin, stdout and stderr.
        io_cases = (
            # Empty PYTHONIOENCODING.
            ('',
             ['stdin: utf-8/surrogateescape',
              'stdout: utf-8/surrogateescape',
              'stderr: utf-8/backslashreplace']),
            # PYTHONIOENCODING has the priority over PYTHONUTF8
            ("latin1",
             ['stdin: iso8859-1/strict',
              'stdout: iso8859-1/strict',
              'stderr: iso8859-1/backslashreplace']),
            (":namereplace",
             ['stdin: utf-8/namereplace',
              'stdout: utf-8/namereplace',
              'stderr: utf-8/backslashreplace']),
        )
        for io_encoding, expected_lines in io_cases:
            result = self.get_output('-X', 'utf8', '-c', code,
                                     PYTHONIOENCODING=io_encoding)
            self.assertEqual(result.splitlines(), expected_lines)

    def test_io(self):
        # open() without explicit arguments, run with PYTHONUTF8=1.
        code = textwrap.dedent('''
            import sys
            filename = sys.argv[1]
            with open(filename) as fp:
                print(f"{fp.encoding}/{fp.errors}")
        ''')
        path = __file__

        result = self.get_output('-c', code, path, PYTHONUTF8='1')
        self.assertEqual(result.lower(), 'utf-8/strict')

    def _check_io_encoding(self, module, encoding=None, errors=None):
        """Check the encoding/errors of a file opened by *module*'s open().

        The child imports ``open`` from *module*, opens this test file with
        whichever of *encoding* and *errors* was given, and prints the
        resulting ``fp.encoding`` and ``fp.errors``.  An argument that is
        not given is expected to come back as 'utf-8' or 'strict'.
        """
        path = __file__

        # Encoding explicitly set: only forward the arguments that were given.
        requested = (('encoding', encoding), ('errors', errors))
        call_args = [f'{name}={value!r}' for name, value in requested if value]

        template = textwrap.dedent('''
            import sys
            from %s import open
            filename = sys.argv[1]
            with open(filename, %s) as fp:
                print(f"{fp.encoding}/{fp.errors}")
        ''')
        code = template % (module, ', '.join(call_args))

        result = self.get_output('-c', code, path, PYTHONUTF8='1')

        expected = f"{encoding or 'utf-8'}/{errors or 'strict'}"
        self.assertEqual(result.lower(), expected)

    def check_io_encoding(self, module):
        """Run _check_io_encoding() for encoding only, errors only, and both."""
        combinations = (
            {'encoding': "latin1"},
            {'errors': "namereplace"},
            {'encoding': "latin1", 'errors': "namereplace"},
        )
        for options in combinations:
            self._check_io_encoding(module, **options)

    def test_io_encoding(self):
        self.check_io_encoding('io')

    def test_pyio_encoding(self):
        self.check_io_encoding('_pyio')

    def test_locale_getpreferredencoding(self):
        code = 'import locale; print(locale.getpreferredencoding(False), locale.getpreferredencoding(True))'
        expected = 'utf-8 utf-8'

        result = self.get_output('-X', 'utf8', '-c', code)
        self.assertEqual(result, expected)

        # Same expectation when a POSIX locale is requested through LC_ALL.
        for name in POSIX_LOCALES:
            with self.subTest(LC_ALL=name):
                result = self.get_output('-X', 'utf8', '-c', code,
                                         LC_ALL=name)
                self.assertEqual(result, expected)

    @unittest.skipIf(MS_WINDOWS, 'test specific to Unix')
    def test_cmd_line(self):
        raw_arg = 'h\xe9\u20ac'.encode('utf-8')
        decoded_utf8 = raw_arg.decode('utf-8')
        decoded_ascii = raw_arg.decode('ascii', 'surrogateescape')
        code = 'import locale, sys; print("%s:%s" % (locale.getpreferredencoding(), ascii(sys.argv[1:])))'

        def check(utf8_opt, expected, **kw):
            # The child prints "<encoding>:<argv>"; only the argv part is
            # compared, the whole line is kept as the failure message.
            line = self.get_output('-X', utf8_opt, '-c', code, raw_arg, **kw)
            argv_repr = line.partition(':')[2].rstrip()
            self.assertEqual(argv_repr, ascii(expected), line)

        check('utf8', [decoded_utf8])
        for name in POSIX_LOCALES:
            with self.subTest(LC_ALL=name):
                check('utf8', [decoded_utf8], LC_ALL=name)

        # With the mode switched off, the expected decoding of the argument
        # under a POSIX locale depends on the platform.
        if sys.platform == 'darwin' or support.is_android or VXWORKS:
            expected_arg = decoded_utf8
        elif sys.platform.startswith("aix"):
            expected_arg = raw_arg.decode('iso-8859-1')
        else:
            expected_arg = decoded_ascii
        for name in POSIX_LOCALES:
            with self.subTest(LC_ALL=name):
                check('utf8=0', [expected_arg], LC_ALL=name)

    def test_optim_level(self):
        # CPython: check that Py_Main() doesn't increment Py_OptimizeFlag
        # twice when -X utf8 requires to parse the configuration twice (when
        # the encoding changes after reading the configuration, the
        # configuration is read again with the new encoding).
        code = 'import sys; print(sys.flags.optimize)'
        for flag, level in (('-O', '1'), ('-OO', '2')):
            result = self.get_output('-X', 'utf8', flag, '-c', code)
            self.assertEqual(result, level)

        code = 'import sys; print(sys.flags.ignore_environment)'
        result = self.get_output('-X', 'utf8', '-E', '-c', code)
        self.assertEqual(result, '1')

    @unittest.skipIf(MS_WINDOWS,
                     "os.device_encoding() doesn't implement "
                     "the UTF-8 Mode on Windows")
    @support.requires_subprocess()
    def test_device_encoding(self):
        # Use stdout as TTY
        if not sys.stdout.isatty():
            self.skipTest("sys.stdout is not a TTY")

        report_path = 'out.txt'
        self.addCleanup(os_helper.unlink, report_path)

        # The child writes its findings to a file so that its stdout can
        # stay attached to the TTY.
        code = (f'import os, sys; fd = sys.stdout.fileno(); '
                f'out = open({report_path!r}, "w", encoding="utf-8"); '
                f'print(os.isatty(fd), os.device_encoding(fd), file=out); '
                f'out.close()')
        command = [sys.executable, '-X', 'utf8', '-c', code]
        # The stdout TTY is inherited to the child process
        completed = subprocess.run(command, text=True)
        self.assertEqual(completed.returncode, 0, completed)

        # In UTF-8 Mode, device_encoding(fd) returns "UTF-8" if fd is a TTY
        with open(report_path, encoding="utf8") as fp:
            report = fp.read().rstrip()
        self.assertEqual(report, 'True utf-8')
# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
|