test_codeccallbacks.py 47 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240
  1. import codecs
  2. import html.entities
  3. import itertools
  4. import sys
  5. import unicodedata
  6. import unittest
  class PosReturn:
      """Error callback whose resume position is configured through ``pos``.

      This can be used for configurable callbacks: set ``pos`` and hand
      ``handle`` to the codec machinery as the error handler.
      """

      def __init__(self):
          self.pos = 0

      def handle(self, exc):
          """Return ``("<?>", pos)`` for the position configured at call time."""
          requested = self.pos
          # A negative position counts back from the end of the input object.
          absolute = requested if requested >= 0 else len(exc.object) + requested
          # If we don't advance this time, terminate on the next call,
          # otherwise we'd get an endless loop.
          if absolute <= exc.start:
              self.pos = len(exc.object)
          return ("<?>", requested)
  class RepeatedPosReturn:
      """Error callback that returns a fixed position a limited number of times."""

      def __init__(self, repl="<?>"):
          self.repl = repl
          self.pos = 0
          self.count = 0

      def handle(self, exc):
          """Return ``(repl, pos)`` while ``count`` is positive, else ``(repl, exc.end)``."""
          if self.count <= 0:
              # Budget exhausted: continue right after the failing range.
              return (self.repl, exc.end)
          self.count -= 1
          return (self.repl, self.pos)
  class BadStartUnicodeEncodeError(UnicodeEncodeError):
      """A UnicodeEncodeError object with a bad start attribute."""

      def __init__(self):
          super().__init__("ascii", "", 0, 1, "bad")
          # Replace the integer start with a list on purpose.
          self.start = []
  class BadObjectUnicodeEncodeError(UnicodeEncodeError):
      """A UnicodeEncodeError object with a bad object attribute."""

      def __init__(self):
          super().__init__("ascii", "", 0, 1, "bad")
          # Replace the str object with a list on purpose.
          self.object = []
  class NoEndUnicodeDecodeError(UnicodeDecodeError):
      """A UnicodeDecodeError object without an end attribute."""

      def __init__(self):
          super().__init__("ascii", bytearray(b""), 0, 1, "bad")
          del self.end
  class BadObjectUnicodeDecodeError(UnicodeDecodeError):
      """A UnicodeDecodeError object with a bad object attribute."""

      def __init__(self):
          super().__init__("ascii", bytearray(b""), 0, 1, "bad")
          # Replace the bytes-like object with a list on purpose.
          self.object = []
  class NoStartUnicodeTranslateError(UnicodeTranslateError):
      """A UnicodeTranslateError object without a start attribute."""

      def __init__(self):
          super().__init__("", 0, 1, "bad")
          del self.start
  class NoEndUnicodeTranslateError(UnicodeTranslateError):
      """A UnicodeTranslateError object without an end attribute."""

      def __init__(self):
          super().__init__("", 0, 1, "bad")
          del self.end
  class NoObjectUnicodeTranslateError(UnicodeTranslateError):
      """A UnicodeTranslateError object without an object attribute."""

      def __init__(self):
          super().__init__("", 0, 1, "bad")
          del self.object
  66. class CodecCallbackTest(unittest.TestCase):
  67. def test_xmlcharrefreplace(self):
  68. # replace unencodable characters which numeric character entities.
  69. # For ascii, latin-1 and charmaps this is completely implemented
  70. # in C and should be reasonably fast.
  71. s = "\u30b9\u30d1\u30e2 \xe4nd eggs"
  72. self.assertEqual(
  73. s.encode("ascii", "xmlcharrefreplace"),
  74. b"&#12473;&#12497;&#12514; &#228;nd eggs"
  75. )
  76. self.assertEqual(
  77. s.encode("latin-1", "xmlcharrefreplace"),
  78. b"&#12473;&#12497;&#12514; \xe4nd eggs"
  79. )
  80. def test_xmlcharnamereplace(self):
  81. # This time use a named character entity for unencodable
  82. # characters, if one is available.
  83. def xmlcharnamereplace(exc):
  84. if not isinstance(exc, UnicodeEncodeError):
  85. raise TypeError("don't know how to handle %r" % exc)
  86. l = []
  87. for c in exc.object[exc.start:exc.end]:
  88. try:
  89. l.append("&%s;" % html.entities.codepoint2name[ord(c)])
  90. except KeyError:
  91. l.append("&#%d;" % ord(c))
  92. return ("".join(l), exc.end)
  93. codecs.register_error(
  94. "test.xmlcharnamereplace", xmlcharnamereplace)
  95. sin = "\xab\u211c\xbb = \u2329\u1234\u20ac\u232a"
  96. sout = b"&laquo;&real;&raquo; = &lang;&#4660;&euro;&rang;"
  97. self.assertEqual(sin.encode("ascii", "test.xmlcharnamereplace"), sout)
  98. sout = b"\xab&real;\xbb = &lang;&#4660;&euro;&rang;"
  99. self.assertEqual(sin.encode("latin-1", "test.xmlcharnamereplace"), sout)
  100. sout = b"\xab&real;\xbb = &lang;&#4660;\xa4&rang;"
  101. self.assertEqual(sin.encode("iso-8859-15", "test.xmlcharnamereplace"), sout)
  102. def test_uninamereplace(self):
  103. # We're using the names from the unicode database this time,
  104. # and we're doing "syntax highlighting" here, i.e. we include
  105. # the replaced text in ANSI escape sequences. For this it is
  106. # useful that the error handler is not called for every single
  107. # unencodable character, but for a complete sequence of
  108. # unencodable characters, otherwise we would output many
  109. # unnecessary escape sequences.
  110. def uninamereplace(exc):
  111. if not isinstance(exc, UnicodeEncodeError):
  112. raise TypeError("don't know how to handle %r" % exc)
  113. l = []
  114. for c in exc.object[exc.start:exc.end]:
  115. l.append(unicodedata.name(c, "0x%x" % ord(c)))
  116. return ("\033[1m%s\033[0m" % ", ".join(l), exc.end)
  117. codecs.register_error(
  118. "test.uninamereplace", uninamereplace)
  119. sin = "\xac\u1234\u20ac\u8000"
  120. sout = b"\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
  121. self.assertEqual(sin.encode("ascii", "test.uninamereplace"), sout)
  122. sout = b"\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
  123. self.assertEqual(sin.encode("latin-1", "test.uninamereplace"), sout)
  124. sout = b"\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1mCJK UNIFIED IDEOGRAPH-8000\033[0m"
  125. self.assertEqual(sin.encode("iso-8859-15", "test.uninamereplace"), sout)
  126. def test_backslashescape(self):
  127. # Does the same as the "unicode-escape" encoding, but with different
  128. # base encodings.
  129. sin = "a\xac\u1234\u20ac\u8000\U0010ffff"
  130. sout = b"a\\xac\\u1234\\u20ac\\u8000\\U0010ffff"
  131. self.assertEqual(sin.encode("ascii", "backslashreplace"), sout)
  132. sout = b"a\xac\\u1234\\u20ac\\u8000\\U0010ffff"
  133. self.assertEqual(sin.encode("latin-1", "backslashreplace"), sout)
  134. sout = b"a\xac\\u1234\xa4\\u8000\\U0010ffff"
  135. self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)
  136. def test_nameescape(self):
  137. # Does the same as backslashescape, but prefers ``\N{...}`` escape
  138. # sequences.
  139. sin = "a\xac\u1234\u20ac\u8000\U0010ffff"
  140. sout = (b'a\\N{NOT SIGN}\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}'
  141. b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
  142. self.assertEqual(sin.encode("ascii", "namereplace"), sout)
  143. sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}'
  144. b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
  145. self.assertEqual(sin.encode("latin-1", "namereplace"), sout)
  146. sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\xa4'
  147. b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
  148. self.assertEqual(sin.encode("iso-8859-15", "namereplace"), sout)
  149. def test_decoding_callbacks(self):
  150. # This is a test for a decoding callback handler
  151. # that allows the decoding of the invalid sequence
  152. # "\xc0\x80" and returns "\x00" instead of raising an error.
  153. # All other illegal sequences will be handled strictly.
  154. def relaxedutf8(exc):
  155. if not isinstance(exc, UnicodeDecodeError):
  156. raise TypeError("don't know how to handle %r" % exc)
  157. if exc.object[exc.start:exc.start+2] == b"\xc0\x80":
  158. return ("\x00", exc.start+2) # retry after two bytes
  159. else:
  160. raise exc
  161. codecs.register_error("test.relaxedutf8", relaxedutf8)
  162. # all the "\xc0\x80" will be decoded to "\x00"
  163. sin = b"a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80"
  164. sout = "a\x00b\x00c\xfc\x00\x00"
  165. self.assertEqual(sin.decode("utf-8", "test.relaxedutf8"), sout)
  166. # "\xc0\x81" is not valid and a UnicodeDecodeError will be raised
  167. sin = b"\xc0\x80\xc0\x81"
  168. self.assertRaises(UnicodeDecodeError, sin.decode,
  169. "utf-8", "test.relaxedutf8")
  170. def test_charmapencode(self):
  171. # For charmap encodings the replacement string will be
  172. # mapped through the encoding again. This means, that
  173. # to be able to use e.g. the "replace" handler, the
  174. # charmap has to have a mapping for "?".
  175. charmap = dict((ord(c), bytes(2*c.upper(), 'ascii')) for c in "abcdefgh")
  176. sin = "abc"
  177. sout = b"AABBCC"
  178. self.assertEqual(codecs.charmap_encode(sin, "strict", charmap)[0], sout)
  179. sin = "abcA"
  180. self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap)
  181. charmap[ord("?")] = b"XYZ"
  182. sin = "abcDEF"
  183. sout = b"AABBCCXYZXYZXYZ"
  184. self.assertEqual(codecs.charmap_encode(sin, "replace", charmap)[0], sout)
  185. charmap[ord("?")] = "XYZ" # wrong type in mapping
  186. self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
  187. def test_callbacks(self):
  188. def handler1(exc):
  189. r = range(exc.start, exc.end)
  190. if isinstance(exc, UnicodeEncodeError):
  191. l = ["<%d>" % ord(exc.object[pos]) for pos in r]
  192. elif isinstance(exc, UnicodeDecodeError):
  193. l = ["<%d>" % exc.object[pos] for pos in r]
  194. else:
  195. raise TypeError("don't know how to handle %r" % exc)
  196. return ("[%s]" % "".join(l), exc.end)
  197. codecs.register_error("test.handler1", handler1)
  198. def handler2(exc):
  199. if not isinstance(exc, UnicodeDecodeError):
  200. raise TypeError("don't know how to handle %r" % exc)
  201. l = ["<%d>" % exc.object[pos] for pos in range(exc.start, exc.end)]
  202. return ("[%s]" % "".join(l), exc.end+1) # skip one character
  203. codecs.register_error("test.handler2", handler2)
  204. s = b"\x00\x81\x7f\x80\xff"
  205. self.assertEqual(
  206. s.decode("ascii", "test.handler1"),
  207. "\x00[<129>]\x7f[<128>][<255>]"
  208. )
  209. self.assertEqual(
  210. s.decode("ascii", "test.handler2"),
  211. "\x00[<129>][<128>]"
  212. )
  213. self.assertEqual(
  214. b"\\u3042\\u3xxx".decode("unicode-escape", "test.handler1"),
  215. "\u3042[<92><117><51>]xxx"
  216. )
  217. self.assertEqual(
  218. b"\\u3042\\u3xx".decode("unicode-escape", "test.handler1"),
  219. "\u3042[<92><117><51>]xx"
  220. )
  221. self.assertEqual(
  222. codecs.charmap_decode(b"abc", "test.handler1", {ord("a"): "z"})[0],
  223. "z[<98>][<99>]"
  224. )
  225. self.assertEqual(
  226. "g\xfc\xdfrk".encode("ascii", "test.handler1"),
  227. b"g[<252><223>]rk"
  228. )
  229. self.assertEqual(
  230. "g\xfc\xdf".encode("ascii", "test.handler1"),
  231. b"g[<252><223>]"
  232. )
  233. def test_longstrings(self):
  234. # test long strings to check for memory overflow problems
  235. errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
  236. "backslashreplace", "namereplace"]
  237. # register the handlers under different names,
  238. # to prevent the codec from recognizing the name
  239. for err in errors:
  240. codecs.register_error("test." + err, codecs.lookup_error(err))
  241. l = 1000
  242. errors += [ "test." + err for err in errors ]
  243. for uni in [ s*l for s in ("x", "\u3042", "a\xe4") ]:
  244. for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15",
  245. "utf-8", "utf-7", "utf-16", "utf-32"):
  246. for err in errors:
  247. try:
  248. uni.encode(enc, err)
  249. except UnicodeError:
  250. pass
  251. def check_exceptionobjectargs(self, exctype, args, msg):
  252. # Test UnicodeError subclasses: construction, attribute assignment and __str__ conversion
  253. # check with one missing argument
  254. self.assertRaises(TypeError, exctype, *args[:-1])
  255. # check with one argument too much
  256. self.assertRaises(TypeError, exctype, *(args + ["too much"]))
  257. # check with one argument of the wrong type
  258. wrongargs = [ "spam", b"eggs", b"spam", 42, 1.0, None ]
  259. for i in range(len(args)):
  260. for wrongarg in wrongargs:
  261. if type(wrongarg) is type(args[i]):
  262. continue
  263. # build argument array
  264. callargs = []
  265. for j in range(len(args)):
  266. if i==j:
  267. callargs.append(wrongarg)
  268. else:
  269. callargs.append(args[i])
  270. self.assertRaises(TypeError, exctype, *callargs)
  271. # check with the correct number and type of arguments
  272. exc = exctype(*args)
  273. self.assertEqual(str(exc), msg)
  274. def test_unicodeencodeerror(self):
  275. self.check_exceptionobjectargs(
  276. UnicodeEncodeError,
  277. ["ascii", "g\xfcrk", 1, 2, "ouch"],
  278. "'ascii' codec can't encode character '\\xfc' in position 1: ouch"
  279. )
  280. self.check_exceptionobjectargs(
  281. UnicodeEncodeError,
  282. ["ascii", "g\xfcrk", 1, 4, "ouch"],
  283. "'ascii' codec can't encode characters in position 1-3: ouch"
  284. )
  285. self.check_exceptionobjectargs(
  286. UnicodeEncodeError,
  287. ["ascii", "\xfcx", 0, 1, "ouch"],
  288. "'ascii' codec can't encode character '\\xfc' in position 0: ouch"
  289. )
  290. self.check_exceptionobjectargs(
  291. UnicodeEncodeError,
  292. ["ascii", "\u0100x", 0, 1, "ouch"],
  293. "'ascii' codec can't encode character '\\u0100' in position 0: ouch"
  294. )
  295. self.check_exceptionobjectargs(
  296. UnicodeEncodeError,
  297. ["ascii", "\uffffx", 0, 1, "ouch"],
  298. "'ascii' codec can't encode character '\\uffff' in position 0: ouch"
  299. )
  300. self.check_exceptionobjectargs(
  301. UnicodeEncodeError,
  302. ["ascii", "\U00010000x", 0, 1, "ouch"],
  303. "'ascii' codec can't encode character '\\U00010000' in position 0: ouch"
  304. )
  305. def test_unicodedecodeerror(self):
  306. self.check_exceptionobjectargs(
  307. UnicodeDecodeError,
  308. ["ascii", bytearray(b"g\xfcrk"), 1, 2, "ouch"],
  309. "'ascii' codec can't decode byte 0xfc in position 1: ouch"
  310. )
  311. self.check_exceptionobjectargs(
  312. UnicodeDecodeError,
  313. ["ascii", bytearray(b"g\xfcrk"), 1, 3, "ouch"],
  314. "'ascii' codec can't decode bytes in position 1-2: ouch"
  315. )
  316. def test_unicodetranslateerror(self):
  317. self.check_exceptionobjectargs(
  318. UnicodeTranslateError,
  319. ["g\xfcrk", 1, 2, "ouch"],
  320. "can't translate character '\\xfc' in position 1: ouch"
  321. )
  322. self.check_exceptionobjectargs(
  323. UnicodeTranslateError,
  324. ["g\u0100rk", 1, 2, "ouch"],
  325. "can't translate character '\\u0100' in position 1: ouch"
  326. )
  327. self.check_exceptionobjectargs(
  328. UnicodeTranslateError,
  329. ["g\uffffrk", 1, 2, "ouch"],
  330. "can't translate character '\\uffff' in position 1: ouch"
  331. )
  332. self.check_exceptionobjectargs(
  333. UnicodeTranslateError,
  334. ["g\U00010000rk", 1, 2, "ouch"],
  335. "can't translate character '\\U00010000' in position 1: ouch"
  336. )
  337. self.check_exceptionobjectargs(
  338. UnicodeTranslateError,
  339. ["g\xfcrk", 1, 3, "ouch"],
  340. "can't translate characters in position 1-2: ouch"
  341. )
  342. def test_badandgoodstrictexceptions(self):
  343. # "strict" complains about a non-exception passed in
  344. self.assertRaises(
  345. TypeError,
  346. codecs.strict_errors,
  347. 42
  348. )
  349. # "strict" complains about the wrong exception type
  350. self.assertRaises(
  351. Exception,
  352. codecs.strict_errors,
  353. Exception("ouch")
  354. )
  355. # If the correct exception is passed in, "strict" raises it
  356. self.assertRaises(
  357. UnicodeEncodeError,
  358. codecs.strict_errors,
  359. UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch")
  360. )
  361. self.assertRaises(
  362. UnicodeDecodeError,
  363. codecs.strict_errors,
  364. UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
  365. )
  366. self.assertRaises(
  367. UnicodeTranslateError,
  368. codecs.strict_errors,
  369. UnicodeTranslateError("\u3042", 0, 1, "ouch")
  370. )
  371. def test_badandgoodignoreexceptions(self):
  372. # "ignore" complains about a non-exception passed in
  373. self.assertRaises(
  374. TypeError,
  375. codecs.ignore_errors,
  376. 42
  377. )
  378. # "ignore" complains about the wrong exception type
  379. self.assertRaises(
  380. TypeError,
  381. codecs.ignore_errors,
  382. UnicodeError("ouch")
  383. )
  384. # If the correct exception is passed in, "ignore" returns an empty replacement
  385. self.assertEqual(
  386. codecs.ignore_errors(
  387. UnicodeEncodeError("ascii", "a\u3042b", 1, 2, "ouch")),
  388. ("", 2)
  389. )
  390. self.assertEqual(
  391. codecs.ignore_errors(
  392. UnicodeDecodeError("ascii", bytearray(b"a\xffb"), 1, 2, "ouch")),
  393. ("", 2)
  394. )
  395. self.assertEqual(
  396. codecs.ignore_errors(
  397. UnicodeTranslateError("a\u3042b", 1, 2, "ouch")),
  398. ("", 2)
  399. )
  def test_badandgoodreplaceexceptions(self):
      # "replace" complains about a non-exception passed in, about the
      # wrong exception type, and about exceptions whose object attribute
      # has been replaced by a list
      rejected = (
          42,
          UnicodeError("ouch"),
          BadObjectUnicodeEncodeError(),
          BadObjectUnicodeDecodeError(),
      )
      for bad in rejected:
          self.assertRaises(TypeError, codecs.replace_errors, bad)

      # With the correct exception, "replace" returns an "?" or "\ufffd"
      # replacement
      accepted = (
          (UnicodeEncodeError("ascii", "a\u3042b", 1, 2, "ouch"), "?"),
          (UnicodeDecodeError("ascii", bytearray(b"a\xffb"), 1, 2, "ouch"),
           "\ufffd"),
          (UnicodeTranslateError("a\u3042b", 1, 2, "ouch"), "\ufffd"),
      )
      for exc, replacement in accepted:
          self.assertEqual(codecs.replace_errors(exc), (replacement, 2))
  439. def test_badandgoodxmlcharrefreplaceexceptions(self):
  440. # "xmlcharrefreplace" complains about a non-exception passed in
  441. self.assertRaises(
  442. TypeError,
  443. codecs.xmlcharrefreplace_errors,
  444. 42
  445. )
  446. # "xmlcharrefreplace" complains about the wrong exception types
  447. self.assertRaises(
  448. TypeError,
  449. codecs.xmlcharrefreplace_errors,
  450. UnicodeError("ouch")
  451. )
  452. # "xmlcharrefreplace" can only be used for encoding
  453. self.assertRaises(
  454. TypeError,
  455. codecs.xmlcharrefreplace_errors,
  456. UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
  457. )
  458. self.assertRaises(
  459. TypeError,
  460. codecs.xmlcharrefreplace_errors,
  461. UnicodeTranslateError("\u3042", 0, 1, "ouch")
  462. )
  463. # Use the correct exception
  464. cs = (0, 1, 9, 10, 99, 100, 999, 1000, 9999, 10000, 99999, 100000,
  465. 999999, 1000000)
  466. cs += (0xd800, 0xdfff)
  467. s = "".join(chr(c) for c in cs)
  468. self.assertEqual(
  469. codecs.xmlcharrefreplace_errors(
  470. UnicodeEncodeError("ascii", "a" + s + "b",
  471. 1, 1 + len(s), "ouch")
  472. ),
  473. ("".join("&#%d;" % c for c in cs), 1 + len(s))
  474. )
  475. def test_badandgoodbackslashreplaceexceptions(self):
  476. # "backslashreplace" complains about a non-exception passed in
  477. self.assertRaises(
  478. TypeError,
  479. codecs.backslashreplace_errors,
  480. 42
  481. )
  482. # "backslashreplace" complains about the wrong exception types
  483. self.assertRaises(
  484. TypeError,
  485. codecs.backslashreplace_errors,
  486. UnicodeError("ouch")
  487. )
  488. # Use the correct exception
  489. tests = [
  490. ("\u3042", "\\u3042"),
  491. ("\n", "\\x0a"),
  492. ("a", "\\x61"),
  493. ("\x00", "\\x00"),
  494. ("\xff", "\\xff"),
  495. ("\u0100", "\\u0100"),
  496. ("\uffff", "\\uffff"),
  497. ("\U00010000", "\\U00010000"),
  498. ("\U0010ffff", "\\U0010ffff"),
  499. # Lone surrogates
  500. ("\ud800", "\\ud800"),
  501. ("\udfff", "\\udfff"),
  502. ("\ud800\udfff", "\\ud800\\udfff"),
  503. ]
  504. for s, r in tests:
  505. with self.subTest(str=s):
  506. self.assertEqual(
  507. codecs.backslashreplace_errors(
  508. UnicodeEncodeError("ascii", "a" + s + "b",
  509. 1, 1 + len(s), "ouch")),
  510. (r, 1 + len(s))
  511. )
  512. self.assertEqual(
  513. codecs.backslashreplace_errors(
  514. UnicodeTranslateError("a" + s + "b",
  515. 1, 1 + len(s), "ouch")),
  516. (r, 1 + len(s))
  517. )
  518. tests = [
  519. (b"a", "\\x61"),
  520. (b"\n", "\\x0a"),
  521. (b"\x00", "\\x00"),
  522. (b"\xff", "\\xff"),
  523. ]
  524. for b, r in tests:
  525. with self.subTest(bytes=b):
  526. self.assertEqual(
  527. codecs.backslashreplace_errors(
  528. UnicodeDecodeError("ascii", bytearray(b"a" + b + b"b"),
  529. 1, 2, "ouch")),
  530. (r, 2)
  531. )
  532. def test_badandgoodnamereplaceexceptions(self):
  533. # "namereplace" complains about a non-exception passed in
  534. self.assertRaises(
  535. TypeError,
  536. codecs.namereplace_errors,
  537. 42
  538. )
  539. # "namereplace" complains about the wrong exception types
  540. self.assertRaises(
  541. TypeError,
  542. codecs.namereplace_errors,
  543. UnicodeError("ouch")
  544. )
  545. # "namereplace" can only be used for encoding
  546. self.assertRaises(
  547. TypeError,
  548. codecs.namereplace_errors,
  549. UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
  550. )
  551. self.assertRaises(
  552. TypeError,
  553. codecs.namereplace_errors,
  554. UnicodeTranslateError("\u3042", 0, 1, "ouch")
  555. )
  556. # Use the correct exception
  557. tests = [
  558. ("\u3042", "\\N{HIRAGANA LETTER A}"),
  559. ("\x00", "\\x00"),
  560. ("\ufbf9", "\\N{ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH "
  561. "HAMZA ABOVE WITH ALEF MAKSURA ISOLATED FORM}"),
  562. ("\U000e007f", "\\N{CANCEL TAG}"),
  563. ("\U0010ffff", "\\U0010ffff"),
  564. # Lone surrogates
  565. ("\ud800", "\\ud800"),
  566. ("\udfff", "\\udfff"),
  567. ("\ud800\udfff", "\\ud800\\udfff"),
  568. ]
  569. for s, r in tests:
  570. with self.subTest(str=s):
  571. self.assertEqual(
  572. codecs.namereplace_errors(
  573. UnicodeEncodeError("ascii", "a" + s + "b",
  574. 1, 1 + len(s), "ouch")),
  575. (r, 1 + len(s))
  576. )
  577. def test_badandgoodsurrogateescapeexceptions(self):
  578. surrogateescape_errors = codecs.lookup_error('surrogateescape')
  579. # "surrogateescape" complains about a non-exception passed in
  580. self.assertRaises(
  581. TypeError,
  582. surrogateescape_errors,
  583. 42
  584. )
  585. # "surrogateescape" complains about the wrong exception types
  586. self.assertRaises(
  587. TypeError,
  588. surrogateescape_errors,
  589. UnicodeError("ouch")
  590. )
  591. # "surrogateescape" can not be used for translating
  592. self.assertRaises(
  593. TypeError,
  594. surrogateescape_errors,
  595. UnicodeTranslateError("\udc80", 0, 1, "ouch")
  596. )
  597. # Use the correct exception
  598. for s in ("a", "\udc7f", "\udd00"):
  599. with self.subTest(str=s):
  600. self.assertRaises(
  601. UnicodeEncodeError,
  602. surrogateescape_errors,
  603. UnicodeEncodeError("ascii", s, 0, 1, "ouch")
  604. )
  605. self.assertEqual(
  606. surrogateescape_errors(
  607. UnicodeEncodeError("ascii", "a\udc80b", 1, 2, "ouch")),
  608. (b"\x80", 2)
  609. )
  610. self.assertRaises(
  611. UnicodeDecodeError,
  612. surrogateescape_errors,
  613. UnicodeDecodeError("ascii", bytearray(b"a"), 0, 1, "ouch")
  614. )
  615. self.assertEqual(
  616. surrogateescape_errors(
  617. UnicodeDecodeError("ascii", bytearray(b"a\x80b"), 1, 2, "ouch")),
  618. ("\udc80", 2)
  619. )
  620. def test_badandgoodsurrogatepassexceptions(self):
  621. surrogatepass_errors = codecs.lookup_error('surrogatepass')
  622. # "surrogatepass" complains about a non-exception passed in
  623. self.assertRaises(
  624. TypeError,
  625. surrogatepass_errors,
  626. 42
  627. )
  628. # "surrogatepass" complains about the wrong exception types
  629. self.assertRaises(
  630. TypeError,
  631. surrogatepass_errors,
  632. UnicodeError("ouch")
  633. )
  634. # "surrogatepass" can not be used for translating
  635. self.assertRaises(
  636. TypeError,
  637. surrogatepass_errors,
  638. UnicodeTranslateError("\ud800", 0, 1, "ouch")
  639. )
  640. # Use the correct exception
  641. for enc in ("utf-8", "utf-16le", "utf-16be", "utf-32le", "utf-32be"):
  642. with self.subTest(encoding=enc):
  643. self.assertRaises(
  644. UnicodeEncodeError,
  645. surrogatepass_errors,
  646. UnicodeEncodeError(enc, "a", 0, 1, "ouch")
  647. )
  648. self.assertRaises(
  649. UnicodeDecodeError,
  650. surrogatepass_errors,
  651. UnicodeDecodeError(enc, "a".encode(enc), 0, 1, "ouch")
  652. )
  653. for s in ("\ud800", "\udfff", "\ud800\udfff"):
  654. with self.subTest(str=s):
  655. self.assertRaises(
  656. UnicodeEncodeError,
  657. surrogatepass_errors,
  658. UnicodeEncodeError("ascii", s, 0, len(s), "ouch")
  659. )
  660. tests = [
  661. ("utf-8", "\ud800", b'\xed\xa0\x80', 3),
  662. ("utf-16le", "\ud800", b'\x00\xd8', 2),
  663. ("utf-16be", "\ud800", b'\xd8\x00', 2),
  664. ("utf-32le", "\ud800", b'\x00\xd8\x00\x00', 4),
  665. ("utf-32be", "\ud800", b'\x00\x00\xd8\x00', 4),
  666. ("utf-8", "\udfff", b'\xed\xbf\xbf', 3),
  667. ("utf-16le", "\udfff", b'\xff\xdf', 2),
  668. ("utf-16be", "\udfff", b'\xdf\xff', 2),
  669. ("utf-32le", "\udfff", b'\xff\xdf\x00\x00', 4),
  670. ("utf-32be", "\udfff", b'\x00\x00\xdf\xff', 4),
  671. ("utf-8", "\ud800\udfff", b'\xed\xa0\x80\xed\xbf\xbf', 3),
  672. ("utf-16le", "\ud800\udfff", b'\x00\xd8\xff\xdf', 2),
  673. ("utf-16be", "\ud800\udfff", b'\xd8\x00\xdf\xff', 2),
  674. ("utf-32le", "\ud800\udfff", b'\x00\xd8\x00\x00\xff\xdf\x00\x00', 4),
  675. ("utf-32be", "\ud800\udfff", b'\x00\x00\xd8\x00\x00\x00\xdf\xff', 4),
  676. ]
  677. for enc, s, b, n in tests:
  678. with self.subTest(encoding=enc, str=s, bytes=b):
  679. self.assertEqual(
  680. surrogatepass_errors(
  681. UnicodeEncodeError(enc, "a" + s + "b",
  682. 1, 1 + len(s), "ouch")),
  683. (b, 1 + len(s))
  684. )
  685. self.assertEqual(
  686. surrogatepass_errors(
  687. UnicodeDecodeError(enc, bytearray(b"a" + b[:n] + b"b"),
  688. 1, 1 + n, "ouch")),
  689. (s[:1], 1 + n)
  690. )
  691. def test_badhandlerresults(self):
  692. results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None), ("foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
  693. encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15")
  694. for res in results:
  695. codecs.register_error("test.badhandler", lambda x: res)
  696. for enc in encs:
  697. self.assertRaises(
  698. TypeError,
  699. "\u3042".encode,
  700. enc,
  701. "test.badhandler"
  702. )
  703. for (enc, bytes) in (
  704. ("ascii", b"\xff"),
  705. ("utf-8", b"\xff"),
  706. ("utf-7", b"+x-"),
  707. ):
  708. self.assertRaises(
  709. TypeError,
  710. bytes.decode,
  711. enc,
  712. "test.badhandler"
  713. )
  714. def test_lookup(self):
  715. self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict"))
  716. self.assertEqual(codecs.ignore_errors, codecs.lookup_error("ignore"))
  717. self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict"))
  718. self.assertEqual(
  719. codecs.xmlcharrefreplace_errors,
  720. codecs.lookup_error("xmlcharrefreplace")
  721. )
  722. self.assertEqual(
  723. codecs.backslashreplace_errors,
  724. codecs.lookup_error("backslashreplace")
  725. )
  726. self.assertEqual(
  727. codecs.namereplace_errors,
  728. codecs.lookup_error("namereplace")
  729. )
  730. def test_encode_nonascii_replacement(self):
  731. def handle(exc):
  732. if isinstance(exc, UnicodeEncodeError):
  733. return (repl, exc.end)
  734. raise TypeError("don't know how to handle %r" % exc)
  735. codecs.register_error("test.replacing", handle)
  736. for enc, input, repl in (
  737. ("ascii", "[¤]", "abc"),
  738. ("iso-8859-1", "[€]", "½¾"),
  739. ("iso-8859-15", "[¤]", "œŸ"),
  740. ):
  741. res = input.encode(enc, "test.replacing")
  742. self.assertEqual(res, ("[" + repl + "]").encode(enc))
  743. for enc, input, repl in (
  744. ("utf-8", "[\udc80]", "\U0001f40d"),
  745. ("utf-16", "[\udc80]", "\U0001f40d"),
  746. ("utf-32", "[\udc80]", "\U0001f40d"),
  747. ):
  748. with self.subTest(encoding=enc):
  749. with self.assertRaises(UnicodeEncodeError) as cm:
  750. input.encode(enc, "test.replacing")
  751. exc = cm.exception
  752. self.assertEqual(exc.start, 1)
  753. self.assertEqual(exc.end, 2)
  754. self.assertEqual(exc.object, input)
  755. def test_encode_unencodable_replacement(self):
  756. def unencrepl(exc):
  757. if isinstance(exc, UnicodeEncodeError):
  758. return (repl, exc.end)
  759. else:
  760. raise TypeError("don't know how to handle %r" % exc)
  761. codecs.register_error("test.unencreplhandler", unencrepl)
  762. for enc, input, repl in (
  763. ("ascii", "[¤]", "½"),
  764. ("iso-8859-1", "[€]", "œ"),
  765. ("iso-8859-15", "[¤]", "½"),
  766. ("utf-8", "[\udc80]", "\udcff"),
  767. ("utf-16", "[\udc80]", "\udcff"),
  768. ("utf-32", "[\udc80]", "\udcff"),
  769. ):
  770. with self.subTest(encoding=enc):
  771. with self.assertRaises(UnicodeEncodeError) as cm:
  772. input.encode(enc, "test.unencreplhandler")
  773. exc = cm.exception
  774. self.assertEqual(exc.start, 1)
  775. self.assertEqual(exc.end, 2)
  776. self.assertEqual(exc.object, input)
  777. def test_encode_bytes_replacement(self):
  778. def handle(exc):
  779. if isinstance(exc, UnicodeEncodeError):
  780. return (repl, exc.end)
  781. raise TypeError("don't know how to handle %r" % exc)
  782. codecs.register_error("test.replacing", handle)
  783. # It works even if the bytes sequence is not decodable.
  784. for enc, input, repl in (
  785. ("ascii", "[¤]", b"\xbd\xbe"),
  786. ("iso-8859-1", "[€]", b"\xbd\xbe"),
  787. ("iso-8859-15", "[¤]", b"\xbd\xbe"),
  788. ("utf-8", "[\udc80]", b"\xbd\xbe"),
  789. ("utf-16le", "[\udc80]", b"\xbd\xbe"),
  790. ("utf-16be", "[\udc80]", b"\xbd\xbe"),
  791. ("utf-32le", "[\udc80]", b"\xbc\xbd\xbe\xbf"),
  792. ("utf-32be", "[\udc80]", b"\xbc\xbd\xbe\xbf"),
  793. ):
  794. with self.subTest(encoding=enc):
  795. res = input.encode(enc, "test.replacing")
  796. self.assertEqual(res, "[".encode(enc) + repl + "]".encode(enc))
  797. def test_encode_odd_bytes_replacement(self):
  798. def handle(exc):
  799. if isinstance(exc, UnicodeEncodeError):
  800. return (repl, exc.end)
  801. raise TypeError("don't know how to handle %r" % exc)
  802. codecs.register_error("test.replacing", handle)
  803. input = "[\udc80]"
  804. # Tests in which the replacement bytestring contains not whole number
  805. # of code units.
  806. for enc, repl in (
  807. *itertools.product(("utf-16le", "utf-16be"),
  808. [b"a", b"abc"]),
  809. *itertools.product(("utf-32le", "utf-32be"),
  810. [b"a", b"ab", b"abc", b"abcde"]),
  811. ):
  812. with self.subTest(encoding=enc, repl=repl):
  813. with self.assertRaises(UnicodeEncodeError) as cm:
  814. input.encode(enc, "test.replacing")
  815. exc = cm.exception
  816. self.assertEqual(exc.start, 1)
  817. self.assertEqual(exc.end, 2)
  818. self.assertEqual(exc.object, input)
  819. self.assertEqual(exc.reason, "surrogates not allowed")
  820. def test_badregistercall(self):
  821. # enhance coverage of:
  822. # Modules/_codecsmodule.c::register_error()
  823. # Python/codecs.c::PyCodec_RegisterError()
  824. self.assertRaises(TypeError, codecs.register_error, 42)
  825. self.assertRaises(TypeError, codecs.register_error, "test.dummy", 42)
  826. def test_badlookupcall(self):
  827. # enhance coverage of:
  828. # Modules/_codecsmodule.c::lookup_error()
  829. self.assertRaises(TypeError, codecs.lookup_error)
  830. def test_unknownhandler(self):
  831. # enhance coverage of:
  832. # Modules/_codecsmodule.c::lookup_error()
  833. self.assertRaises(LookupError, codecs.lookup_error, "test.unknown")
  834. def test_xmlcharrefvalues(self):
  835. # enhance coverage of:
  836. # Python/codecs.c::PyCodec_XMLCharRefReplaceErrors()
  837. # and inline implementations
  838. v = (1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000,
  839. 500000, 1000000)
  840. s = "".join([chr(x) for x in v])
  841. codecs.register_error("test.xmlcharrefreplace", codecs.xmlcharrefreplace_errors)
  842. for enc in ("ascii", "iso-8859-15"):
  843. for err in ("xmlcharrefreplace", "test.xmlcharrefreplace"):
  844. s.encode(enc, err)
  845. def test_decodehelper(self):
  846. # enhance coverage of:
  847. # Objects/unicodeobject.c::unicode_decode_call_errorhandler()
  848. # and callers
  849. self.assertRaises(LookupError, b"\xff".decode, "ascii", "test.unknown")
  850. def baddecodereturn1(exc):
  851. return 42
  852. codecs.register_error("test.baddecodereturn1", baddecodereturn1)
  853. self.assertRaises(TypeError, b"\xff".decode, "ascii", "test.baddecodereturn1")
  854. self.assertRaises(TypeError, b"\\".decode, "unicode-escape", "test.baddecodereturn1")
  855. self.assertRaises(TypeError, b"\\x0".decode, "unicode-escape", "test.baddecodereturn1")
  856. self.assertRaises(TypeError, b"\\x0y".decode, "unicode-escape", "test.baddecodereturn1")
  857. self.assertRaises(TypeError, b"\\Uffffeeee".decode, "unicode-escape", "test.baddecodereturn1")
  858. self.assertRaises(TypeError, b"\\uyyyy".decode, "raw-unicode-escape", "test.baddecodereturn1")
  859. def baddecodereturn2(exc):
  860. return ("?", None)
  861. codecs.register_error("test.baddecodereturn2", baddecodereturn2)
  862. self.assertRaises(TypeError, b"\xff".decode, "ascii", "test.baddecodereturn2")
  863. handler = PosReturn()
  864. codecs.register_error("test.posreturn", handler.handle)
  865. # Valid negative position
  866. handler.pos = -1
  867. self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?>0")
  868. # Valid negative position
  869. handler.pos = -2
  870. self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?><?>")
  871. # Negative position out of bounds
  872. handler.pos = -3
  873. self.assertRaises(IndexError, b"\xff0".decode, "ascii", "test.posreturn")
  874. # Valid positive position
  875. handler.pos = 1
  876. self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?>0")
  877. # Largest valid positive position (one beyond end of input)
  878. handler.pos = 2
  879. self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?>")
  880. # Invalid positive position
  881. handler.pos = 3
  882. self.assertRaises(IndexError, b"\xff0".decode, "ascii", "test.posreturn")
  883. # Restart at the "0"
  884. handler.pos = 6
  885. self.assertEqual(b"\\uyyyy0".decode("raw-unicode-escape", "test.posreturn"), "<?>0")
  886. class D(dict):
  887. def __getitem__(self, key):
  888. raise ValueError
  889. self.assertRaises(UnicodeError, codecs.charmap_decode, b"\xff", "strict", {0xff: None})
  890. self.assertRaises(ValueError, codecs.charmap_decode, b"\xff", "strict", D())
  891. self.assertRaises(TypeError, codecs.charmap_decode, b"\xff", "strict", {0xff: sys.maxunicode+1})
  892. def test_encodehelper(self):
  893. # enhance coverage of:
  894. # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
  895. # and callers
  896. self.assertRaises(LookupError, "\xff".encode, "ascii", "test.unknown")
  897. def badencodereturn1(exc):
  898. return 42
  899. codecs.register_error("test.badencodereturn1", badencodereturn1)
  900. self.assertRaises(TypeError, "\xff".encode, "ascii", "test.badencodereturn1")
  901. def badencodereturn2(exc):
  902. return ("?", None)
  903. codecs.register_error("test.badencodereturn2", badencodereturn2)
  904. self.assertRaises(TypeError, "\xff".encode, "ascii", "test.badencodereturn2")
  905. handler = PosReturn()
  906. codecs.register_error("test.posreturn", handler.handle)
  907. # Valid negative position
  908. handler.pos = -1
  909. self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?>0")
  910. # Valid negative position
  911. handler.pos = -2
  912. self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?><?>")
  913. # Negative position out of bounds
  914. handler.pos = -3
  915. self.assertRaises(IndexError, "\xff0".encode, "ascii", "test.posreturn")
  916. # Valid positive position
  917. handler.pos = 1
  918. self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?>0")
  919. # Largest valid positive position (one beyond end of input
  920. handler.pos = 2
  921. self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?>")
  922. # Invalid positive position
  923. handler.pos = 3
  924. self.assertRaises(IndexError, "\xff0".encode, "ascii", "test.posreturn")
  925. handler.pos = 0
  926. class D(dict):
  927. def __getitem__(self, key):
  928. raise ValueError
  929. for err in ("strict", "replace", "xmlcharrefreplace",
  930. "backslashreplace", "namereplace", "test.posreturn"):
  931. self.assertRaises(UnicodeError, codecs.charmap_encode, "\xff", err, {0xff: None})
  932. self.assertRaises(ValueError, codecs.charmap_encode, "\xff", err, D())
  933. self.assertRaises(TypeError, codecs.charmap_encode, "\xff", err, {0xff: 300})
  934. def test_decodehelper_bug36819(self):
  935. handler = RepeatedPosReturn("x")
  936. codecs.register_error("test.bug36819", handler.handle)
  937. testcases = [
  938. ("ascii", b"\xff"),
  939. ("utf-8", b"\xff"),
  940. ("utf-16be", b'\xdc\x80'),
  941. ("utf-32be", b'\x00\x00\xdc\x80'),
  942. ("iso-8859-6", b"\xff"),
  943. ]
  944. for enc, bad in testcases:
  945. input = "abcd".encode(enc) + bad
  946. with self.subTest(encoding=enc):
  947. handler.count = 50
  948. decoded = input.decode(enc, "test.bug36819")
  949. self.assertEqual(decoded, 'abcdx' * 51)
  950. def test_encodehelper_bug36819(self):
  951. handler = RepeatedPosReturn()
  952. codecs.register_error("test.bug36819", handler.handle)
  953. input = "abcd\udc80"
  954. encodings = ["ascii", "latin1", "utf-8", "utf-16", "utf-32"] # built-in
  955. encodings += ["iso-8859-15"] # charmap codec
  956. if sys.platform == 'win32':
  957. encodings = ["mbcs", "oem"] # code page codecs
  958. handler.repl = "\udcff"
  959. for enc in encodings:
  960. with self.subTest(encoding=enc):
  961. handler.count = 50
  962. with self.assertRaises(UnicodeEncodeError) as cm:
  963. input.encode(enc, "test.bug36819")
  964. exc = cm.exception
  965. self.assertEqual(exc.start, 4)
  966. self.assertEqual(exc.end, 5)
  967. self.assertEqual(exc.object, input)
  968. if sys.platform == "win32":
  969. handler.count = 50
  970. with self.assertRaises(UnicodeEncodeError) as cm:
  971. codecs.code_page_encode(437, input, "test.bug36819")
  972. exc = cm.exception
  973. self.assertEqual(exc.start, 4)
  974. self.assertEqual(exc.end, 5)
  975. self.assertEqual(exc.object, input)
  976. handler.repl = "x"
  977. for enc in encodings:
  978. with self.subTest(encoding=enc):
  979. # The interpreter should segfault after a handful of attempts.
  980. # 50 was chosen to try to ensure a segfault without a fix,
  981. # but not OOM a machine with one.
  982. handler.count = 50
  983. encoded = input.encode(enc, "test.bug36819")
  984. self.assertEqual(encoded.decode(enc), "abcdx" * 51)
  985. if sys.platform == "win32":
  986. handler.count = 50
  987. encoded = codecs.code_page_encode(437, input, "test.bug36819")
  988. self.assertEqual(encoded[0].decode(), "abcdx" * 51)
  989. self.assertEqual(encoded[1], len(input))
  990. def test_translatehelper(self):
  991. # enhance coverage of:
  992. # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
  993. # and callers
  994. # (Unfortunately the errors argument is not directly accessible
  995. # from Python, so we can't test that much)
  996. class D(dict):
  997. def __getitem__(self, key):
  998. raise ValueError
  999. #self.assertRaises(ValueError, "\xff".translate, D())
  1000. self.assertRaises(ValueError, "\xff".translate, {0xff: sys.maxunicode+1})
  1001. self.assertRaises(TypeError, "\xff".translate, {0xff: ()})
  1002. def test_bug828737(self):
  1003. charmap = {
  1004. ord("&"): "&amp;",
  1005. ord("<"): "&lt;",
  1006. ord(">"): "&gt;",
  1007. ord('"'): "&quot;",
  1008. }
  1009. for n in (1, 10, 100, 1000):
  1010. text = 'abc<def>ghi'*n
  1011. text.translate(charmap)
  1012. def test_mutatingdecodehandler(self):
  1013. baddata = [
  1014. ("ascii", b"\xff"),
  1015. ("utf-7", b"++"),
  1016. ("utf-8", b"\xff"),
  1017. ("utf-16", b"\xff"),
  1018. ("utf-32", b"\xff"),
  1019. ("unicode-escape", b"\\u123g"),
  1020. ("raw-unicode-escape", b"\\u123g"),
  1021. ]
  1022. def replacing(exc):
  1023. if isinstance(exc, UnicodeDecodeError):
  1024. exc.object = 42
  1025. return ("\u4242", 0)
  1026. else:
  1027. raise TypeError("don't know how to handle %r" % exc)
  1028. codecs.register_error("test.replacing", replacing)
  1029. for (encoding, data) in baddata:
  1030. with self.assertRaises(TypeError):
  1031. data.decode(encoding, "test.replacing")
  1032. def mutating(exc):
  1033. if isinstance(exc, UnicodeDecodeError):
  1034. exc.object = b""
  1035. return ("\u4242", 0)
  1036. else:
  1037. raise TypeError("don't know how to handle %r" % exc)
  1038. codecs.register_error("test.mutating", mutating)
  1039. # If the decoder doesn't pick up the modified input the following
  1040. # will lead to an endless loop
  1041. for (encoding, data) in baddata:
  1042. self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
  1043. # issue32583
  1044. def test_crashing_decode_handler(self):
  1045. # better generating one more character to fill the extra space slot
  1046. # so in debug build it can steadily fail
  1047. def forward_shorter_than_end(exc):
  1048. if isinstance(exc, UnicodeDecodeError):
  1049. # size one character, 0 < forward < exc.end
  1050. return ('\ufffd', exc.start+1)
  1051. else:
  1052. raise TypeError("don't know how to handle %r" % exc)
  1053. codecs.register_error(
  1054. "test.forward_shorter_than_end", forward_shorter_than_end)
  1055. self.assertEqual(
  1056. b'\xd8\xd8\xd8\xd8\xd8\x00\x00\x00'.decode(
  1057. 'utf-16-le', 'test.forward_shorter_than_end'),
  1058. '\ufffd\ufffd\ufffd\ufffd\xd8\x00'
  1059. )
  1060. self.assertEqual(
  1061. b'\xd8\xd8\xd8\xd8\x00\xd8\x00\x00'.decode(
  1062. 'utf-16-be', 'test.forward_shorter_than_end'),
  1063. '\ufffd\ufffd\ufffd\ufffd\xd8\x00'
  1064. )
  1065. self.assertEqual(
  1066. b'\x11\x11\x11\x11\x11\x00\x00\x00\x00\x00\x00'.decode(
  1067. 'utf-32-le', 'test.forward_shorter_than_end'),
  1068. '\ufffd\ufffd\ufffd\u1111\x00'
  1069. )
  1070. self.assertEqual(
  1071. b'\x11\x11\x11\x00\x00\x11\x11\x00\x00\x00\x00'.decode(
  1072. 'utf-32-be', 'test.forward_shorter_than_end'),
  1073. '\ufffd\ufffd\ufffd\u1111\x00'
  1074. )
  1075. def replace_with_long(exc):
  1076. if isinstance(exc, UnicodeDecodeError):
  1077. exc.object = b"\x00" * 8
  1078. return ('\ufffd', exc.start)
  1079. else:
  1080. raise TypeError("don't know how to handle %r" % exc)
  1081. codecs.register_error("test.replace_with_long", replace_with_long)
  1082. self.assertEqual(
  1083. b'\x00'.decode('utf-16', 'test.replace_with_long'),
  1084. '\ufffd\x00\x00\x00\x00'
  1085. )
  1086. self.assertEqual(
  1087. b'\x00'.decode('utf-32', 'test.replace_with_long'),
  1088. '\ufffd\x00\x00'
  1089. )
  1090. def test_fake_error_class(self):
  1091. handlers = [
  1092. codecs.strict_errors,
  1093. codecs.ignore_errors,
  1094. codecs.replace_errors,
  1095. codecs.backslashreplace_errors,
  1096. codecs.namereplace_errors,
  1097. codecs.xmlcharrefreplace_errors,
  1098. codecs.lookup_error('surrogateescape'),
  1099. codecs.lookup_error('surrogatepass'),
  1100. ]
  1101. for cls in UnicodeEncodeError, UnicodeDecodeError, UnicodeTranslateError:
  1102. class FakeUnicodeError(str):
  1103. __class__ = cls
  1104. for handler in handlers:
  1105. with self.subTest(handler=handler, error_class=cls):
  1106. self.assertRaises(TypeError, handler, FakeUnicodeError())
  1107. class FakeUnicodeError(Exception):
  1108. __class__ = cls
  1109. for handler in handlers:
  1110. with self.subTest(handler=handler, error_class=cls):
  1111. with self.assertRaises((TypeError, FakeUnicodeError)):
  1112. handler(FakeUnicodeError())
  1113. if __name__ == "__main__":
  1114. unittest.main()