# test_html.py (4.2 KB)
  1. """
  2. Tests for the html module functions.
  3. """
  4. import html
  5. import unittest
  6. class HtmlTests(unittest.TestCase):
  7. def test_escape(self):
  8. self.assertEqual(
  9. html.escape('\'<script>"&foo;"</script>\''),
  10. '&#x27;&lt;script&gt;&quot;&amp;foo;&quot;&lt;/script&gt;&#x27;')
  11. self.assertEqual(
  12. html.escape('\'<script>"&foo;"</script>\'', False),
  13. '\'&lt;script&gt;"&amp;foo;"&lt;/script&gt;\'')
  14. def test_unescape(self):
  15. numeric_formats = ['&#%d', '&#%d;', '&#x%x', '&#x%x;']
  16. errmsg = 'unescape(%r) should have returned %r'
  17. def check(text, expected):
  18. self.assertEqual(html.unescape(text), expected,
  19. msg=errmsg % (text, expected))
  20. def check_num(num, expected):
  21. for format in numeric_formats:
  22. text = format % num
  23. self.assertEqual(html.unescape(text), expected,
  24. msg=errmsg % (text, expected))
  25. # check text with no character references
  26. check('no character references', 'no character references')
  27. # check & followed by invalid chars
  28. check('&\n&\t& &&', '&\n&\t& &&')
  29. # check & followed by numbers and letters
  30. check('&0 &9 &a &0; &9; &a;', '&0 &9 &a &0; &9; &a;')
  31. # check incomplete entities at the end of the string
  32. for x in ['&', '&#', '&#x', '&#X', '&#y', '&#xy', '&#Xy']:
  33. check(x, x)
  34. check(x+';', x+';')
  35. # check several combinations of numeric character references,
  36. # possibly followed by different characters
  37. formats = ['&#%d', '&#%07d', '&#%d;', '&#%07d;',
  38. '&#x%x', '&#x%06x', '&#x%x;', '&#x%06x;',
  39. '&#x%X', '&#x%06X', '&#X%x;', '&#X%06x;']
  40. for num, char in zip([65, 97, 34, 38, 0x2603, 0x101234],
  41. ['A', 'a', '"', '&', '\u2603', '\U00101234']):
  42. for s in formats:
  43. check(s % num, char)
  44. for end in [' ', 'X']:
  45. check((s+end) % num, char+end)
  46. # check invalid code points
  47. for cp in [0xD800, 0xDB00, 0xDC00, 0xDFFF, 0x110000]:
  48. check_num(cp, '\uFFFD')
  49. # check more invalid code points
  50. for cp in [0x1, 0xb, 0xe, 0x7f, 0xfffe, 0xffff, 0x10fffe, 0x10ffff]:
  51. check_num(cp, '')
  52. # check invalid numbers
  53. for num, ch in zip([0x0d, 0x80, 0x95, 0x9d], '\r\u20ac\u2022\x9d'):
  54. check_num(num, ch)
  55. # check small numbers
  56. check_num(0, '\uFFFD')
  57. check_num(9, '\t')
  58. # check a big number
  59. check_num(1000000000000000000, '\uFFFD')
  60. # check that multiple trailing semicolons are handled correctly
  61. for e in ['&quot;;', '&#34;;', '&#x22;;', '&#X22;;']:
  62. check(e, '";')
  63. # check that semicolons in the middle don't create problems
  64. for e in ['&quot;quot;', '&#34;quot;', '&#x22;quot;', '&#X22;quot;']:
  65. check(e, '"quot;')
  66. # check triple adjacent charrefs
  67. for e in ['&quot', '&#34', '&#x22', '&#X22']:
  68. check(e*3, '"""')
  69. check((e+';')*3, '"""')
  70. # check that the case is respected
  71. for e in ['&amp', '&amp;', '&AMP', '&AMP;']:
  72. check(e, '&')
  73. for e in ['&Amp', '&Amp;']:
  74. check(e, e)
  75. # check that non-existent named entities are returned unchanged
  76. check('&svadilfari;', '&svadilfari;')
  77. # the following examples are in the html5 specs
  78. check('&notit', '¬it')
  79. check('&notit;', '¬it;')
  80. check('&notin', '¬in')
  81. check('&notin;', '∉')
  82. # a similar example with a long name
  83. check('&notReallyAnExistingNamedCharacterReference;',
  84. '¬ReallyAnExistingNamedCharacterReference;')
  85. # longest valid name
  86. check('&CounterClockwiseContourIntegral;', '∳')
  87. # check a charref that maps to two unicode chars
  88. check('&acE;', '\u223E\u0333')
  89. check('&acE', '&acE')
  90. # see #12888
  91. check('&#123; ' * 1050, '{ ' * 1050)
  92. # see #15156
  93. check('&Eacuteric&Eacute;ric&alphacentauri&alpha;centauri',
  94. 'ÉricÉric&alphacentauriαcentauri')
  95. check('&co;', '&co;')
  96. if __name__ == '__main__':
  97. unittest.main()