# test_utf8_mode.py
"""
Test the implementation of PEP 540: the UTF-8 Mode.
"""
import locale
import subprocess
import sys
import textwrap
import unittest

from test import support
from test.support import os_helper
from test.support.script_helper import assert_python_ok, assert_python_failure
  12. MS_WINDOWS = (sys.platform == 'win32')
  13. POSIX_LOCALES = ('C', 'POSIX')
  14. VXWORKS = (sys.platform == "vxworks")
  15. class UTF8ModeTests(unittest.TestCase):
  16. DEFAULT_ENV = {
  17. 'PYTHONUTF8': '',
  18. 'PYTHONLEGACYWINDOWSFSENCODING': '',
  19. 'PYTHONCOERCECLOCALE': '0',
  20. }
  21. def posix_locale(self):
  22. loc = locale.setlocale(locale.LC_CTYPE, None)
  23. return (loc in POSIX_LOCALES)
  24. def get_output(self, *args, failure=False, **kw):
  25. kw = dict(self.DEFAULT_ENV, **kw)
  26. if failure:
  27. out = assert_python_failure(*args, **kw)
  28. out = out[2]
  29. else:
  30. out = assert_python_ok(*args, **kw)
  31. out = out[1]
  32. return out.decode().rstrip("\n\r")
  33. @unittest.skipIf(MS_WINDOWS, 'Windows has no POSIX locale')
  34. def test_posix_locale(self):
  35. code = 'import sys; print(sys.flags.utf8_mode)'
  36. for loc in POSIX_LOCALES:
  37. with self.subTest(LC_ALL=loc):
  38. out = self.get_output('-c', code, LC_ALL=loc)
  39. self.assertEqual(out, '1')
  40. def test_xoption(self):
  41. code = 'import sys; print(sys.flags.utf8_mode)'
  42. out = self.get_output('-X', 'utf8', '-c', code)
  43. self.assertEqual(out, '1')
  44. # undocumented but accepted syntax: -X utf8=1
  45. out = self.get_output('-X', 'utf8=1', '-c', code)
  46. self.assertEqual(out, '1')
  47. out = self.get_output('-X', 'utf8=0', '-c', code)
  48. self.assertEqual(out, '0')
  49. if MS_WINDOWS:
  50. # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 Mode
  51. # and has the priority over -X utf8
  52. out = self.get_output('-X', 'utf8', '-c', code,
  53. PYTHONLEGACYWINDOWSFSENCODING='1')
  54. self.assertEqual(out, '0')
  55. def test_env_var(self):
  56. code = 'import sys; print(sys.flags.utf8_mode)'
  57. out = self.get_output('-c', code, PYTHONUTF8='1')
  58. self.assertEqual(out, '1')
  59. out = self.get_output('-c', code, PYTHONUTF8='0')
  60. self.assertEqual(out, '0')
  61. # -X utf8 has the priority over PYTHONUTF8
  62. out = self.get_output('-X', 'utf8=0', '-c', code, PYTHONUTF8='1')
  63. self.assertEqual(out, '0')
  64. if MS_WINDOWS:
  65. # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode
  66. # and has the priority over PYTHONUTF8
  67. out = self.get_output('-X', 'utf8', '-c', code, PYTHONUTF8='1',
  68. PYTHONLEGACYWINDOWSFSENCODING='1')
  69. self.assertEqual(out, '0')
  70. # Cannot test with the POSIX locale, since the POSIX locale enables
  71. # the UTF-8 mode
  72. if not self.posix_locale():
  73. # PYTHONUTF8 should be ignored if -E is used
  74. out = self.get_output('-E', '-c', code, PYTHONUTF8='1')
  75. self.assertEqual(out, '0')
  76. # invalid mode
  77. out = self.get_output('-c', code, PYTHONUTF8='xxx', failure=True)
  78. self.assertIn('invalid PYTHONUTF8 environment variable value',
  79. out.rstrip())
  80. def test_filesystemencoding(self):
  81. code = textwrap.dedent('''
  82. import sys
  83. print("{}/{}".format(sys.getfilesystemencoding(),
  84. sys.getfilesystemencodeerrors()))
  85. ''')
  86. if MS_WINDOWS:
  87. expected = 'utf-8/surrogatepass'
  88. else:
  89. expected = 'utf-8/surrogateescape'
  90. out = self.get_output('-X', 'utf8', '-c', code)
  91. self.assertEqual(out, expected)
  92. if MS_WINDOWS:
  93. # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode
  94. # and has the priority over -X utf8 and PYTHONUTF8
  95. out = self.get_output('-X', 'utf8', '-c', code,
  96. PYTHONUTF8='strict',
  97. PYTHONLEGACYWINDOWSFSENCODING='1')
  98. self.assertEqual(out, 'mbcs/replace')
  99. def test_stdio(self):
  100. code = textwrap.dedent('''
  101. import sys
  102. print(f"stdin: {sys.stdin.encoding}/{sys.stdin.errors}")
  103. print(f"stdout: {sys.stdout.encoding}/{sys.stdout.errors}")
  104. print(f"stderr: {sys.stderr.encoding}/{sys.stderr.errors}")
  105. ''')
  106. out = self.get_output('-X', 'utf8', '-c', code,
  107. PYTHONIOENCODING='')
  108. self.assertEqual(out.splitlines(),
  109. ['stdin: utf-8/surrogateescape',
  110. 'stdout: utf-8/surrogateescape',
  111. 'stderr: utf-8/backslashreplace'])
  112. # PYTHONIOENCODING has the priority over PYTHONUTF8
  113. out = self.get_output('-X', 'utf8', '-c', code,
  114. PYTHONIOENCODING="latin1")
  115. self.assertEqual(out.splitlines(),
  116. ['stdin: iso8859-1/strict',
  117. 'stdout: iso8859-1/strict',
  118. 'stderr: iso8859-1/backslashreplace'])
  119. out = self.get_output('-X', 'utf8', '-c', code,
  120. PYTHONIOENCODING=":namereplace")
  121. self.assertEqual(out.splitlines(),
  122. ['stdin: utf-8/namereplace',
  123. 'stdout: utf-8/namereplace',
  124. 'stderr: utf-8/backslashreplace'])
  125. def test_io(self):
  126. code = textwrap.dedent('''
  127. import sys
  128. filename = sys.argv[1]
  129. with open(filename) as fp:
  130. print(f"{fp.encoding}/{fp.errors}")
  131. ''')
  132. filename = __file__
  133. out = self.get_output('-c', code, filename, PYTHONUTF8='1')
  134. self.assertEqual(out.lower(), 'utf-8/strict')
  135. def _check_io_encoding(self, module, encoding=None, errors=None):
  136. filename = __file__
  137. # Encoding explicitly set
  138. args = []
  139. if encoding:
  140. args.append(f'encoding={encoding!r}')
  141. if errors:
  142. args.append(f'errors={errors!r}')
  143. code = textwrap.dedent('''
  144. import sys
  145. from %s import open
  146. filename = sys.argv[1]
  147. with open(filename, %s) as fp:
  148. print(f"{fp.encoding}/{fp.errors}")
  149. ''') % (module, ', '.join(args))
  150. out = self.get_output('-c', code, filename,
  151. PYTHONUTF8='1')
  152. if not encoding:
  153. encoding = 'utf-8'
  154. if not errors:
  155. errors = 'strict'
  156. self.assertEqual(out.lower(), f'{encoding}/{errors}')
  157. def check_io_encoding(self, module):
  158. self._check_io_encoding(module, encoding="latin1")
  159. self._check_io_encoding(module, errors="namereplace")
  160. self._check_io_encoding(module,
  161. encoding="latin1", errors="namereplace")
  162. def test_io_encoding(self):
  163. self.check_io_encoding('io')
  164. def test_pyio_encoding(self):
  165. self.check_io_encoding('_pyio')
  166. def test_locale_getpreferredencoding(self):
  167. code = 'import locale; print(locale.getpreferredencoding(False), locale.getpreferredencoding(True))'
  168. out = self.get_output('-X', 'utf8', '-c', code)
  169. self.assertEqual(out, 'utf-8 utf-8')
  170. for loc in POSIX_LOCALES:
  171. with self.subTest(LC_ALL=loc):
  172. out = self.get_output('-X', 'utf8', '-c', code, LC_ALL=loc)
  173. self.assertEqual(out, 'utf-8 utf-8')
  174. @unittest.skipIf(MS_WINDOWS, 'test specific to Unix')
  175. def test_cmd_line(self):
  176. arg = 'h\xe9\u20ac'.encode('utf-8')
  177. arg_utf8 = arg.decode('utf-8')
  178. arg_ascii = arg.decode('ascii', 'surrogateescape')
  179. code = 'import locale, sys; print("%s:%s" % (locale.getpreferredencoding(), ascii(sys.argv[1:])))'
  180. def check(utf8_opt, expected, **kw):
  181. out = self.get_output('-X', utf8_opt, '-c', code, arg, **kw)
  182. args = out.partition(':')[2].rstrip()
  183. self.assertEqual(args, ascii(expected), out)
  184. check('utf8', [arg_utf8])
  185. for loc in POSIX_LOCALES:
  186. with self.subTest(LC_ALL=loc):
  187. check('utf8', [arg_utf8], LC_ALL=loc)
  188. if sys.platform == 'darwin' or support.is_android or VXWORKS:
  189. c_arg = arg_utf8
  190. elif sys.platform.startswith("aix"):
  191. c_arg = arg.decode('iso-8859-1')
  192. else:
  193. c_arg = arg_ascii
  194. for loc in POSIX_LOCALES:
  195. with self.subTest(LC_ALL=loc):
  196. check('utf8=0', [c_arg], LC_ALL=loc)
  197. def test_optim_level(self):
  198. # CPython: check that Py_Main() doesn't increment Py_OptimizeFlag
  199. # twice when -X utf8 requires to parse the configuration twice (when
  200. # the encoding changes after reading the configuration, the
  201. # configuration is read again with the new encoding).
  202. code = 'import sys; print(sys.flags.optimize)'
  203. out = self.get_output('-X', 'utf8', '-O', '-c', code)
  204. self.assertEqual(out, '1')
  205. out = self.get_output('-X', 'utf8', '-OO', '-c', code)
  206. self.assertEqual(out, '2')
  207. code = 'import sys; print(sys.flags.ignore_environment)'
  208. out = self.get_output('-X', 'utf8', '-E', '-c', code)
  209. self.assertEqual(out, '1')
  210. @unittest.skipIf(MS_WINDOWS,
  211. "os.device_encoding() doesn't implement "
  212. "the UTF-8 Mode on Windows")
  213. @support.requires_subprocess()
  214. def test_device_encoding(self):
  215. # Use stdout as TTY
  216. if not sys.stdout.isatty():
  217. self.skipTest("sys.stdout is not a TTY")
  218. filename = 'out.txt'
  219. self.addCleanup(os_helper.unlink, filename)
  220. code = (f'import os, sys; fd = sys.stdout.fileno(); '
  221. f'out = open({filename!r}, "w", encoding="utf-8"); '
  222. f'print(os.isatty(fd), os.device_encoding(fd), file=out); '
  223. f'out.close()')
  224. cmd = [sys.executable, '-X', 'utf8', '-c', code]
  225. # The stdout TTY is inherited to the child process
  226. proc = subprocess.run(cmd, text=True)
  227. self.assertEqual(proc.returncode, 0, proc)
  228. # In UTF-8 Mode, device_encoding(fd) returns "UTF-8" if fd is a TTY
  229. with open(filename, encoding="utf8") as fp:
  230. out = fp.read().rstrip()
  231. self.assertEqual(out, 'True utf-8')
  232. if __name__ == "__main__":
  233. unittest.main()