# test_htmlparser.py (33 KB)
  1. """Tests for HTMLParser.py."""
  2. import html.parser
  3. import pprint
  4. import unittest
  5. class EventCollector(html.parser.HTMLParser):
  6. def __init__(self, *args, **kw):
  7. self.events = []
  8. self.append = self.events.append
  9. html.parser.HTMLParser.__init__(self, *args, **kw)
  10. def get_events(self):
  11. # Normalize the list of events so that buffer artefacts don't
  12. # separate runs of contiguous characters.
  13. L = []
  14. prevtype = None
  15. for event in self.events:
  16. type = event[0]
  17. if type == prevtype == "data":
  18. L[-1] = ("data", L[-1][1] + event[1])
  19. else:
  20. L.append(event)
  21. prevtype = type
  22. self.events = L
  23. return L
  24. # structure markup
  25. def handle_starttag(self, tag, attrs):
  26. self.append(("starttag", tag, attrs))
  27. def handle_startendtag(self, tag, attrs):
  28. self.append(("startendtag", tag, attrs))
  29. def handle_endtag(self, tag):
  30. self.append(("endtag", tag))
  31. # all other markup
  32. def handle_comment(self, data):
  33. self.append(("comment", data))
  34. def handle_charref(self, data):
  35. self.append(("charref", data))
  36. def handle_data(self, data):
  37. self.append(("data", data))
  38. def handle_decl(self, data):
  39. self.append(("decl", data))
  40. def handle_entityref(self, data):
  41. self.append(("entityref", data))
  42. def handle_pi(self, data):
  43. self.append(("pi", data))
  44. def unknown_decl(self, decl):
  45. self.append(("unknown decl", decl))
  46. class EventCollectorExtra(EventCollector):
  47. def handle_starttag(self, tag, attrs):
  48. EventCollector.handle_starttag(self, tag, attrs)
  49. self.append(("starttag_text", self.get_starttag_text()))
  50. class EventCollectorCharrefs(EventCollector):
  51. def handle_charref(self, data):
  52. self.fail('This should never be called with convert_charrefs=True')
  53. def handle_entityref(self, data):
  54. self.fail('This should never be called with convert_charrefs=True')
  55. class TestCaseBase(unittest.TestCase):
  56. def get_collector(self):
  57. return EventCollector(convert_charrefs=False)
  58. def _run_check(self, source, expected_events, collector=None):
  59. if collector is None:
  60. collector = self.get_collector()
  61. parser = collector
  62. for s in source:
  63. parser.feed(s)
  64. parser.close()
  65. events = parser.get_events()
  66. if events != expected_events:
  67. self.fail("received events did not match expected events" +
  68. "\nSource:\n" + repr(source) +
  69. "\nExpected:\n" + pprint.pformat(expected_events) +
  70. "\nReceived:\n" + pprint.pformat(events))
  71. def _run_check_extra(self, source, events):
  72. self._run_check(source, events,
  73. EventCollectorExtra(convert_charrefs=False))
  74. class HTMLParserTestCase(TestCaseBase):
  75. def test_processing_instruction_only(self):
  76. self._run_check("<?processing instruction>", [
  77. ("pi", "processing instruction"),
  78. ])
  79. self._run_check("<?processing instruction ?>", [
  80. ("pi", "processing instruction ?"),
  81. ])
  82. def test_simple_html(self):
  83. self._run_check("""
  84. <!DOCTYPE html PUBLIC 'foo'>
  85. <HTML>&entity;&#32;
  86. <!--comment1a
  87. -></foo><bar>&lt;<?pi?></foo<bar
  88. comment1b-->
  89. <Img sRc='Bar' isMAP>sample
  90. text
  91. &#x201C;
  92. <!--comment2a-- --comment2b-->
  93. </Html>
  94. """, [
  95. ("data", "\n"),
  96. ("decl", "DOCTYPE html PUBLIC 'foo'"),
  97. ("data", "\n"),
  98. ("starttag", "html", []),
  99. ("entityref", "entity"),
  100. ("charref", "32"),
  101. ("data", "\n"),
  102. ("comment", "comment1a\n-></foo><bar>&lt;<?pi?></foo<bar\ncomment1b"),
  103. ("data", "\n"),
  104. ("starttag", "img", [("src", "Bar"), ("ismap", None)]),
  105. ("data", "sample\ntext\n"),
  106. ("charref", "x201C"),
  107. ("data", "\n"),
  108. ("comment", "comment2a-- --comment2b"),
  109. ("data", "\n"),
  110. ("endtag", "html"),
  111. ("data", "\n"),
  112. ])
  113. def test_malformatted_charref(self):
  114. self._run_check("<p>&#bad;</p>", [
  115. ("starttag", "p", []),
  116. ("data", "&#bad;"),
  117. ("endtag", "p"),
  118. ])
  119. # add the [] as a workaround to avoid buffering (see #20288)
  120. self._run_check(["<div>&#bad;</div>"], [
  121. ("starttag", "div", []),
  122. ("data", "&#bad;"),
  123. ("endtag", "div"),
  124. ])
  125. def test_unclosed_entityref(self):
  126. self._run_check("&entityref foo", [
  127. ("entityref", "entityref"),
  128. ("data", " foo"),
  129. ])
  130. def test_bad_nesting(self):
  131. # Strangely, this *is* supposed to test that overlapping
  132. # elements are allowed. HTMLParser is more geared toward
  133. # lexing the input that parsing the structure.
  134. self._run_check("<a><b></a></b>", [
  135. ("starttag", "a", []),
  136. ("starttag", "b", []),
  137. ("endtag", "a"),
  138. ("endtag", "b"),
  139. ])
  140. def test_bare_ampersands(self):
  141. self._run_check("this text & contains & ampersands &", [
  142. ("data", "this text & contains & ampersands &"),
  143. ])
  144. def test_bare_pointy_brackets(self):
  145. self._run_check("this < text > contains < bare>pointy< brackets", [
  146. ("data", "this < text > contains < bare>pointy< brackets"),
  147. ])
  148. def test_starttag_end_boundary(self):
  149. self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])])
  150. self._run_check("""<a b='>'>""", [("starttag", "a", [("b", ">")])])
  151. def test_buffer_artefacts(self):
  152. output = [("starttag", "a", [("b", "<")])]
  153. self._run_check(["<a b='<'>"], output)
  154. self._run_check(["<a ", "b='<'>"], output)
  155. self._run_check(["<a b", "='<'>"], output)
  156. self._run_check(["<a b=", "'<'>"], output)
  157. self._run_check(["<a b='<", "'>"], output)
  158. self._run_check(["<a b='<'", ">"], output)
  159. output = [("starttag", "a", [("b", ">")])]
  160. self._run_check(["<a b='>'>"], output)
  161. self._run_check(["<a ", "b='>'>"], output)
  162. self._run_check(["<a b", "='>'>"], output)
  163. self._run_check(["<a b=", "'>'>"], output)
  164. self._run_check(["<a b='>", "'>"], output)
  165. self._run_check(["<a b='>'", ">"], output)
  166. output = [("comment", "abc")]
  167. self._run_check(["", "<!--abc-->"], output)
  168. self._run_check(["<", "!--abc-->"], output)
  169. self._run_check(["<!", "--abc-->"], output)
  170. self._run_check(["<!-", "-abc-->"], output)
  171. self._run_check(["<!--", "abc-->"], output)
  172. self._run_check(["<!--a", "bc-->"], output)
  173. self._run_check(["<!--ab", "c-->"], output)
  174. self._run_check(["<!--abc", "-->"], output)
  175. self._run_check(["<!--abc-", "->"], output)
  176. self._run_check(["<!--abc--", ">"], output)
  177. self._run_check(["<!--abc-->", ""], output)
  178. def test_valid_doctypes(self):
  179. # from http://www.w3.org/QA/2002/04/valid-dtd-list.html
  180. dtds = ['HTML', # HTML5 doctype
  181. ('HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" '
  182. '"http://www.w3.org/TR/html4/strict.dtd"'),
  183. ('HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" '
  184. '"http://www.w3.org/TR/html4/loose.dtd"'),
  185. ('html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" '
  186. '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"'),
  187. ('html PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN" '
  188. '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"'),
  189. ('math PUBLIC "-//W3C//DTD MathML 2.0//EN" '
  190. '"http://www.w3.org/Math/DTD/mathml2/mathml2.dtd"'),
  191. ('html PUBLIC "-//W3C//DTD '
  192. 'XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN" '
  193. '"http://www.w3.org/2002/04/xhtml-math-svg/xhtml-math-svg.dtd"'),
  194. ('svg PUBLIC "-//W3C//DTD SVG 1.1//EN" '
  195. '"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"'),
  196. 'html PUBLIC "-//IETF//DTD HTML 2.0//EN"',
  197. 'html PUBLIC "-//W3C//DTD HTML 3.2 Final//EN"']
  198. for dtd in dtds:
  199. self._run_check("<!DOCTYPE %s>" % dtd,
  200. [('decl', 'DOCTYPE ' + dtd)])
  201. def test_startendtag(self):
  202. self._run_check("<p/>", [
  203. ("startendtag", "p", []),
  204. ])
  205. self._run_check("<p></p>", [
  206. ("starttag", "p", []),
  207. ("endtag", "p"),
  208. ])
  209. self._run_check("<p><img src='foo' /></p>", [
  210. ("starttag", "p", []),
  211. ("startendtag", "img", [("src", "foo")]),
  212. ("endtag", "p"),
  213. ])
  214. def test_get_starttag_text(self):
  215. s = """<foo:bar \n one="1"\ttwo=2 >"""
  216. self._run_check_extra(s, [
  217. ("starttag", "foo:bar", [("one", "1"), ("two", "2")]),
  218. ("starttag_text", s)])
  219. def test_cdata_content(self):
  220. contents = [
  221. '<!-- not a comment --> &not-an-entity-ref;',
  222. "<not a='start tag'>",
  223. '<a href="" /> <p> <span></span>',
  224. 'foo = "</scr" + "ipt>";',
  225. 'foo = "</SCRIPT" + ">";',
  226. 'foo = <\n/script> ',
  227. '<!-- document.write("</scr" + "ipt>"); -->',
  228. ('\n//<![CDATA[\n'
  229. 'document.write(\'<s\'+\'cript type="text/javascript" '
  230. 'src="http://www.example.org/r=\'+new '
  231. 'Date().getTime()+\'"><\\/s\'+\'cript>\');\n//]]>'),
  232. '\n<!-- //\nvar foo = 3.14;\n// -->\n',
  233. 'foo = "</sty" + "le>";',
  234. '<!-- \u2603 -->',
  235. # these two should be invalid according to the HTML 5 spec,
  236. # section 8.1.2.2
  237. #'foo = </\nscript>',
  238. #'foo = </ script>',
  239. ]
  240. elements = ['script', 'style', 'SCRIPT', 'STYLE', 'Script', 'Style']
  241. for content in contents:
  242. for element in elements:
  243. element_lower = element.lower()
  244. s = '<{element}>{content}</{element}>'.format(element=element,
  245. content=content)
  246. self._run_check(s, [("starttag", element_lower, []),
  247. ("data", content),
  248. ("endtag", element_lower)])
  249. def test_cdata_with_closing_tags(self):
  250. # see issue #13358
  251. # make sure that HTMLParser calls handle_data only once for each CDATA.
  252. # The normal event collector normalizes the events in get_events,
  253. # so we override it to return the original list of events.
  254. class Collector(EventCollector):
  255. def get_events(self):
  256. return self.events
  257. content = """<!-- not a comment --> &not-an-entity-ref;
  258. <a href="" /> </p><p> <span></span></style>
  259. '</script' + '>'"""
  260. for element in [' script', 'script ', ' script ',
  261. '\nscript', 'script\n', '\nscript\n']:
  262. element_lower = element.lower().strip()
  263. s = '<script>{content}</{element}>'.format(element=element,
  264. content=content)
  265. self._run_check(s, [("starttag", element_lower, []),
  266. ("data", content),
  267. ("endtag", element_lower)],
  268. collector=Collector(convert_charrefs=False))
  269. def test_comments(self):
  270. html = ("<!-- I'm a valid comment -->"
  271. '<!--me too!-->'
  272. '<!------>'
  273. '<!---->'
  274. '<!----I have many hyphens---->'
  275. '<!-- I have a > in the middle -->'
  276. '<!-- and I have -- in the middle! -->')
  277. expected = [('comment', " I'm a valid comment "),
  278. ('comment', 'me too!'),
  279. ('comment', '--'),
  280. ('comment', ''),
  281. ('comment', '--I have many hyphens--'),
  282. ('comment', ' I have a > in the middle '),
  283. ('comment', ' and I have -- in the middle! ')]
  284. self._run_check(html, expected)
  285. def test_condcoms(self):
  286. html = ('<!--[if IE & !(lte IE 8)]>aren\'t<![endif]-->'
  287. '<!--[if IE 8]>condcoms<![endif]-->'
  288. '<!--[if lte IE 7]>pretty?<![endif]-->')
  289. expected = [('comment', "[if IE & !(lte IE 8)]>aren't<![endif]"),
  290. ('comment', '[if IE 8]>condcoms<![endif]'),
  291. ('comment', '[if lte IE 7]>pretty?<![endif]')]
  292. self._run_check(html, expected)
  293. def test_convert_charrefs(self):
  294. # default value for convert_charrefs is now True
  295. collector = lambda: EventCollectorCharrefs()
  296. self.assertTrue(collector().convert_charrefs)
  297. charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22']
  298. # check charrefs in the middle of the text/attributes
  299. expected = [('starttag', 'a', [('href', 'foo"zar')]),
  300. ('data', 'a"z'), ('endtag', 'a')]
  301. for charref in charrefs:
  302. self._run_check('<a href="foo{0}zar">a{0}z</a>'.format(charref),
  303. expected, collector=collector())
  304. # check charrefs at the beginning/end of the text/attributes
  305. expected = [('data', '"'),
  306. ('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]),
  307. ('data', '"'), ('endtag', 'a'), ('data', '"')]
  308. for charref in charrefs:
  309. self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">'
  310. '{0}</a>{0}'.format(charref),
  311. expected, collector=collector())
  312. # check charrefs in <script>/<style> elements
  313. for charref in charrefs:
  314. text = 'X'.join([charref]*3)
  315. expected = [('data', '"'),
  316. ('starttag', 'script', []), ('data', text),
  317. ('endtag', 'script'), ('data', '"'),
  318. ('starttag', 'style', []), ('data', text),
  319. ('endtag', 'style'), ('data', '"')]
  320. self._run_check('{1}<script>{0}</script>{1}'
  321. '<style>{0}</style>{1}'.format(text, charref),
  322. expected, collector=collector())
  323. # check truncated charrefs at the end of the file
  324. html = '&quo &# &#x'
  325. for x in range(1, len(html)):
  326. self._run_check(html[:x], [('data', html[:x])],
  327. collector=collector())
  328. # check a string with no charrefs
  329. self._run_check('no charrefs here', [('data', 'no charrefs here')],
  330. collector=collector())
  331. # the remaining tests were for the "tolerant" parser (which is now
  332. # the default), and check various kind of broken markup
  333. def test_tolerant_parsing(self):
  334. self._run_check('<html <html>te>>xt&a<<bc</a></html>\n'
  335. '<img src="URL><//img></html</html>', [
  336. ('starttag', 'html', [('<html', None)]),
  337. ('data', 'te>>xt'),
  338. ('entityref', 'a'),
  339. ('data', '<'),
  340. ('starttag', 'bc<', [('a', None)]),
  341. ('endtag', 'html'),
  342. ('data', '\n<img src="URL>'),
  343. ('comment', '/img'),
  344. ('endtag', 'html<')])
  345. def test_starttag_junk_chars(self):
  346. self._run_check("</>", [])
  347. self._run_check("</$>", [('comment', '$')])
  348. self._run_check("</", [('data', '</')])
  349. self._run_check("</a", [('data', '</a')])
  350. self._run_check("<a<a>", [('starttag', 'a<a', [])])
  351. self._run_check("</a<a>", [('endtag', 'a<a')])
  352. self._run_check("<!", [('data', '<!')])
  353. self._run_check("<a", [('data', '<a')])
  354. self._run_check("<a foo='bar'", [('data', "<a foo='bar'")])
  355. self._run_check("<a foo='bar", [('data', "<a foo='bar")])
  356. self._run_check("<a foo='>'", [('data', "<a foo='>'")])
  357. self._run_check("<a foo='>", [('data', "<a foo='>")])
  358. self._run_check("<a$>", [('starttag', 'a$', [])])
  359. self._run_check("<a$b>", [('starttag', 'a$b', [])])
  360. self._run_check("<a$b/>", [('startendtag', 'a$b', [])])
  361. self._run_check("<a$b >", [('starttag', 'a$b', [])])
  362. self._run_check("<a$b />", [('startendtag', 'a$b', [])])
  363. def test_slashes_in_starttag(self):
  364. self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
  365. html = ('<img width=902 height=250px '
  366. 'src="/sites/default/files/images/homepage/foo.jpg" '
  367. '/*what am I doing here*/ />')
  368. expected = [(
  369. 'startendtag', 'img',
  370. [('width', '902'), ('height', '250px'),
  371. ('src', '/sites/default/files/images/homepage/foo.jpg'),
  372. ('*what', None), ('am', None), ('i', None),
  373. ('doing', None), ('here*', None)]
  374. )]
  375. self._run_check(html, expected)
  376. html = ('<a / /foo/ / /=/ / /bar/ / />'
  377. '<a / /foo/ / /=/ / /bar/ / >')
  378. expected = [
  379. ('startendtag', 'a', [('foo', None), ('=', None), ('bar', None)]),
  380. ('starttag', 'a', [('foo', None), ('=', None), ('bar', None)])
  381. ]
  382. self._run_check(html, expected)
  383. #see issue #14538
  384. html = ('<meta><meta / ><meta // ><meta / / >'
  385. '<meta/><meta /><meta //><meta//>')
  386. expected = [
  387. ('starttag', 'meta', []), ('starttag', 'meta', []),
  388. ('starttag', 'meta', []), ('starttag', 'meta', []),
  389. ('startendtag', 'meta', []), ('startendtag', 'meta', []),
  390. ('startendtag', 'meta', []), ('startendtag', 'meta', []),
  391. ]
  392. self._run_check(html, expected)
  393. def test_declaration_junk_chars(self):
  394. self._run_check("<!DOCTYPE foo $ >", [('decl', 'DOCTYPE foo $ ')])
  395. def test_illegal_declarations(self):
  396. self._run_check('<!spacer type="block" height="25">',
  397. [('comment', 'spacer type="block" height="25"')])
  398. def test_invalid_end_tags(self):
  399. # A collection of broken end tags. <br> is used as separator.
  400. # see http://www.w3.org/TR/html5/tokenization.html#end-tag-open-state
  401. # and #13993
  402. html = ('<br></label</p><br></div end tmAd-leaderBoard><br></<h4><br>'
  403. '</li class="unit"><br></li\r\n\t\t\t\t\t\t</ul><br></><br>')
  404. expected = [('starttag', 'br', []),
  405. # < is part of the name, / is discarded, p is an attribute
  406. ('endtag', 'label<'),
  407. ('starttag', 'br', []),
  408. # text and attributes are discarded
  409. ('endtag', 'div'),
  410. ('starttag', 'br', []),
  411. # comment because the first char after </ is not a-zA-Z
  412. ('comment', '<h4'),
  413. ('starttag', 'br', []),
  414. # attributes are discarded
  415. ('endtag', 'li'),
  416. ('starttag', 'br', []),
  417. # everything till ul (included) is discarded
  418. ('endtag', 'li'),
  419. ('starttag', 'br', []),
  420. # </> is ignored
  421. ('starttag', 'br', [])]
  422. self._run_check(html, expected)
  423. def test_broken_invalid_end_tag(self):
  424. # This is technically wrong (the "> shouldn't be included in the 'data')
  425. # but is probably not worth fixing it (in addition to all the cases of
  426. # the previous test, it would require a full attribute parsing).
  427. # see #13993
  428. html = '<b>This</b attr=">"> confuses the parser'
  429. expected = [('starttag', 'b', []),
  430. ('data', 'This'),
  431. ('endtag', 'b'),
  432. ('data', '"> confuses the parser')]
  433. self._run_check(html, expected)
  434. def test_correct_detection_of_start_tags(self):
  435. # see #13273
  436. html = ('<div style="" ><b>The <a href="some_url">rain</a> '
  437. '<br /> in <span>Spain</span></b></div>')
  438. expected = [
  439. ('starttag', 'div', [('style', '')]),
  440. ('starttag', 'b', []),
  441. ('data', 'The '),
  442. ('starttag', 'a', [('href', 'some_url')]),
  443. ('data', 'rain'),
  444. ('endtag', 'a'),
  445. ('data', ' '),
  446. ('startendtag', 'br', []),
  447. ('data', ' in '),
  448. ('starttag', 'span', []),
  449. ('data', 'Spain'),
  450. ('endtag', 'span'),
  451. ('endtag', 'b'),
  452. ('endtag', 'div')
  453. ]
  454. self._run_check(html, expected)
  455. html = '<div style="", foo = "bar" ><b>The <a href="some_url">rain</a>'
  456. expected = [
  457. ('starttag', 'div', [('style', ''), (',', None), ('foo', 'bar')]),
  458. ('starttag', 'b', []),
  459. ('data', 'The '),
  460. ('starttag', 'a', [('href', 'some_url')]),
  461. ('data', 'rain'),
  462. ('endtag', 'a'),
  463. ]
  464. self._run_check(html, expected)
  465. def test_EOF_in_charref(self):
  466. # see #17802
  467. # This test checks that the UnboundLocalError reported in the issue
  468. # is not raised, however I'm not sure the returned values are correct.
  469. # Maybe HTMLParser should use self.unescape for these
  470. data = [
  471. ('a&', [('data', 'a&')]),
  472. ('a&b', [('data', 'ab')]),
  473. ('a&b ', [('data', 'a'), ('entityref', 'b'), ('data', ' ')]),
  474. ('a&b;', [('data', 'a'), ('entityref', 'b')]),
  475. ]
  476. for html, expected in data:
  477. self._run_check(html, expected)
  478. def test_broken_comments(self):
  479. html = ('<! not really a comment >'
  480. '<! not a comment either -->'
  481. '<! -- close enough -->'
  482. '<!><!<-- this was an empty comment>'
  483. '<!!! another bogus comment !!!>')
  484. expected = [
  485. ('comment', ' not really a comment '),
  486. ('comment', ' not a comment either --'),
  487. ('comment', ' -- close enough --'),
  488. ('comment', ''),
  489. ('comment', '<-- this was an empty comment'),
  490. ('comment', '!! another bogus comment !!!'),
  491. ]
  492. self._run_check(html, expected)
  493. def test_broken_condcoms(self):
  494. # these condcoms are missing the '--' after '<!' and before the '>'
  495. html = ('<![if !(IE)]>broken condcom<![endif]>'
  496. '<![if ! IE]><link href="favicon.tiff"/><![endif]>'
  497. '<![if !IE 6]><img src="firefox.png" /><![endif]>'
  498. '<![if !ie 6]><b>foo</b><![endif]>'
  499. '<![if (!IE)|(lt IE 9)]><img src="mammoth.bmp" /><![endif]>')
  500. # According to the HTML5 specs sections "8.2.4.44 Bogus comment state"
  501. # and "8.2.4.45 Markup declaration open state", comment tokens should
  502. # be emitted instead of 'unknown decl', but calling unknown_decl
  503. # provides more flexibility.
  504. # See also Lib/_markupbase.py:parse_declaration
  505. expected = [
  506. ('unknown decl', 'if !(IE)'),
  507. ('data', 'broken condcom'),
  508. ('unknown decl', 'endif'),
  509. ('unknown decl', 'if ! IE'),
  510. ('startendtag', 'link', [('href', 'favicon.tiff')]),
  511. ('unknown decl', 'endif'),
  512. ('unknown decl', 'if !IE 6'),
  513. ('startendtag', 'img', [('src', 'firefox.png')]),
  514. ('unknown decl', 'endif'),
  515. ('unknown decl', 'if !ie 6'),
  516. ('starttag', 'b', []),
  517. ('data', 'foo'),
  518. ('endtag', 'b'),
  519. ('unknown decl', 'endif'),
  520. ('unknown decl', 'if (!IE)|(lt IE 9)'),
  521. ('startendtag', 'img', [('src', 'mammoth.bmp')]),
  522. ('unknown decl', 'endif')
  523. ]
  524. self._run_check(html, expected)
  525. def test_convert_charrefs_dropped_text(self):
  526. # #23144: make sure that all the events are triggered when
  527. # convert_charrefs is True, even if we don't call .close()
  528. parser = EventCollector(convert_charrefs=True)
  529. # before the fix, bar & baz was missing
  530. parser.feed("foo <a>link</a> bar &amp; baz")
  531. self.assertEqual(
  532. parser.get_events(),
  533. [('data', 'foo '), ('starttag', 'a', []), ('data', 'link'),
  534. ('endtag', 'a'), ('data', ' bar & baz')]
  535. )
  536. class AttributesTestCase(TestCaseBase):
  537. def test_attr_syntax(self):
  538. output = [
  539. ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])
  540. ]
  541. self._run_check("""<a b='v' c="v" d=v e>""", output)
  542. self._run_check("""<a b = 'v' c = "v" d = v e>""", output)
  543. self._run_check("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
  544. self._run_check("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)
  545. def test_attr_values(self):
  546. self._run_check("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
  547. [("starttag", "a", [("b", "xxx\n\txxx"),
  548. ("c", "yyy\t\nyyy"),
  549. ("d", "\txyz\n")])])
  550. self._run_check("""<a b='' c="">""",
  551. [("starttag", "a", [("b", ""), ("c", "")])])
  552. # Regression test for SF patch #669683.
  553. self._run_check("<e a=rgb(1,2,3)>",
  554. [("starttag", "e", [("a", "rgb(1,2,3)")])])
  555. # Regression test for SF bug #921657.
  556. self._run_check(
  557. "<a href=mailto:xyz@example.com>",
  558. [("starttag", "a", [("href", "mailto:xyz@example.com")])])
  559. def test_attr_nonascii(self):
  560. # see issue 7311
  561. self._run_check(
  562. "<img src=/foo/bar.png alt=\u4e2d\u6587>",
  563. [("starttag", "img", [("src", "/foo/bar.png"),
  564. ("alt", "\u4e2d\u6587")])])
  565. self._run_check(
  566. "<a title='\u30c6\u30b9\u30c8' href='\u30c6\u30b9\u30c8.html'>",
  567. [("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
  568. ("href", "\u30c6\u30b9\u30c8.html")])])
  569. self._run_check(
  570. '<a title="\u30c6\u30b9\u30c8" href="\u30c6\u30b9\u30c8.html">',
  571. [("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
  572. ("href", "\u30c6\u30b9\u30c8.html")])])
  573. def test_attr_entity_replacement(self):
  574. self._run_check(
  575. "<a b='&amp;&gt;&lt;&quot;&apos;'>",
  576. [("starttag", "a", [("b", "&><\"'")])])
  577. def test_attr_funky_names(self):
  578. self._run_check(
  579. "<a a.b='v' c:d=v e-f=v>",
  580. [("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")])])
  581. def test_entityrefs_in_attributes(self):
  582. self._run_check(
  583. "<html foo='&euro;&amp;&#97;&#x61;&unsupported;'>",
  584. [("starttag", "html", [("foo", "\u20AC&aa&unsupported;")])])
  585. def test_attr_funky_names2(self):
  586. self._run_check(
  587. r"<a $><b $=%><c \=/>",
  588. [("starttag", "a", [("$", None)]),
  589. ("starttag", "b", [("$", "%")]),
  590. ("starttag", "c", [("\\", "/")])])
  591. def test_entities_in_attribute_value(self):
  592. # see #1200313
  593. for entity in ['&', '&amp;', '&#38;', '&#x26;']:
  594. self._run_check('<a href="%s">' % entity,
  595. [("starttag", "a", [("href", "&")])])
  596. self._run_check("<a href='%s'>" % entity,
  597. [("starttag", "a", [("href", "&")])])
  598. self._run_check("<a href=%s>" % entity,
  599. [("starttag", "a", [("href", "&")])])
  600. def test_malformed_attributes(self):
  601. # see #13357
  602. html = (
  603. "<a href=test'style='color:red;bad1'>test - bad1</a>"
  604. "<a href=test'+style='color:red;ba2'>test - bad2</a>"
  605. "<a href=test'&nbsp;style='color:red;bad3'>test - bad3</a>"
  606. "<a href = test'&nbsp;style='color:red;bad4' >test - bad4</a>"
  607. )
  608. expected = [
  609. ('starttag', 'a', [('href', "test'style='color:red;bad1'")]),
  610. ('data', 'test - bad1'), ('endtag', 'a'),
  611. ('starttag', 'a', [('href', "test'+style='color:red;ba2'")]),
  612. ('data', 'test - bad2'), ('endtag', 'a'),
  613. ('starttag', 'a', [('href', "test'\xa0style='color:red;bad3'")]),
  614. ('data', 'test - bad3'), ('endtag', 'a'),
  615. ('starttag', 'a', [('href', "test'\xa0style='color:red;bad4'")]),
  616. ('data', 'test - bad4'), ('endtag', 'a')
  617. ]
  618. self._run_check(html, expected)
  619. def test_malformed_adjacent_attributes(self):
  620. # see #12629
  621. self._run_check('<x><y z=""o"" /></x>',
  622. [('starttag', 'x', []),
  623. ('startendtag', 'y', [('z', ''), ('o""', None)]),
  624. ('endtag', 'x')])
  625. self._run_check('<x><y z="""" /></x>',
  626. [('starttag', 'x', []),
  627. ('startendtag', 'y', [('z', ''), ('""', None)]),
  628. ('endtag', 'x')])
  629. # see #755670 for the following 3 tests
  630. def test_adjacent_attributes(self):
  631. self._run_check('<a width="100%"cellspacing=0>',
  632. [("starttag", "a",
  633. [("width", "100%"), ("cellspacing","0")])])
  634. self._run_check('<a id="foo"class="bar">',
  635. [("starttag", "a",
  636. [("id", "foo"), ("class","bar")])])
  637. def test_missing_attribute_value(self):
  638. self._run_check('<a v=>',
  639. [("starttag", "a", [("v", "")])])
  640. def test_javascript_attribute_value(self):
  641. self._run_check("<a href=javascript:popup('/popup/help.html')>",
  642. [("starttag", "a",
  643. [("href", "javascript:popup('/popup/help.html')")])])
  644. def test_end_tag_in_attribute_value(self):
  645. # see #1745761
  646. self._run_check("<a href='http://www.example.org/\">;'>spam</a>",
  647. [("starttag", "a",
  648. [("href", "http://www.example.org/\">;")]),
  649. ("data", "spam"), ("endtag", "a")])
  650. def test_with_unquoted_attributes(self):
  651. # see #12008
  652. html = ("<html><body bgcolor=d0ca90 text='181008'>"
  653. "<table cellspacing=0 cellpadding=1 width=100% ><tr>"
  654. "<td align=left><font size=-1>"
  655. "- <a href=/rabota/><span class=en> software-and-i</span></a>"
  656. "- <a href='/1/'><span class=en> library</span></a></table>")
  657. expected = [
  658. ('starttag', 'html', []),
  659. ('starttag', 'body', [('bgcolor', 'd0ca90'), ('text', '181008')]),
  660. ('starttag', 'table',
  661. [('cellspacing', '0'), ('cellpadding', '1'), ('width', '100%')]),
  662. ('starttag', 'tr', []),
  663. ('starttag', 'td', [('align', 'left')]),
  664. ('starttag', 'font', [('size', '-1')]),
  665. ('data', '- '), ('starttag', 'a', [('href', '/rabota/')]),
  666. ('starttag', 'span', [('class', 'en')]), ('data', ' software-and-i'),
  667. ('endtag', 'span'), ('endtag', 'a'),
  668. ('data', '- '), ('starttag', 'a', [('href', '/1/')]),
  669. ('starttag', 'span', [('class', 'en')]), ('data', ' library'),
  670. ('endtag', 'span'), ('endtag', 'a'), ('endtag', 'table')
  671. ]
  672. self._run_check(html, expected)
  673. def test_comma_between_attributes(self):
  674. # see bpo 41478
  675. # HTMLParser preserves duplicate attributes, leaving the task of
  676. # removing duplicate attributes to a conformant html tree builder
  677. html = ('<div class=bar,baz=asd>' # between attrs (unquoted)
  678. '<div class="bar",baz="asd">' # between attrs (quoted)
  679. '<div class=bar, baz=asd,>' # after values (unquoted)
  680. '<div class="bar", baz="asd",>' # after values (quoted)
  681. '<div class="bar",>' # one comma values (quoted)
  682. '<div class=,bar baz=,asd>' # before values (unquoted)
  683. '<div class=,"bar" baz=,"asd">' # before values (quoted)
  684. '<div ,class=bar ,baz=asd>' # before names
  685. '<div class,="bar" baz,="asd">' # after names
  686. )
  687. expected = [
  688. ('starttag', 'div', [('class', 'bar,baz=asd'),]),
  689. ('starttag', 'div', [('class', 'bar'), (',baz', 'asd')]),
  690. ('starttag', 'div', [('class', 'bar,'), ('baz', 'asd,')]),
  691. ('starttag', 'div', [('class', 'bar'), (',', None),
  692. ('baz', 'asd'), (',', None)]),
  693. ('starttag', 'div', [('class', 'bar'), (',', None)]),
  694. ('starttag', 'div', [('class', ',bar'), ('baz', ',asd')]),
  695. ('starttag', 'div', [('class', ',"bar"'), ('baz', ',"asd"')]),
  696. ('starttag', 'div', [(',class', 'bar'), (',baz', 'asd')]),
  697. ('starttag', 'div', [('class,', 'bar'), ('baz,', 'asd')]),
  698. ]
  699. self._run_check(html, expected)
  700. def test_weird_chars_in_unquoted_attribute_values(self):
  701. self._run_check('<form action=bogus|&#()value>', [
  702. ('starttag', 'form',
  703. [('action', 'bogus|&#()value')])])
  704. if __name__ == "__main__":
  705. unittest.main()